library(car)
## Loading required package: carData
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(readxl)
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
#load the dataset
mydata <- read.csv('./data/day.csv')
#convert 'dteday' column to Date format
mydata$dteday <- as.Date(mydata$dteday)
#season
mydata$season <- cut(mydata$season,
breaks = c(0.5, 1.5, 2.5, 3.5, 4.5),
labels = c("Winter", "Spring", "Summer", "Fall"))
mydata$season <- factor(mydata$season, levels = c("Winter", "Spring", "Summer", "Fall"))
#workingday
mydata$workingday <- ifelse(mydata$workingday == 0, "Not_Workingday", "Workingday")
mydata$workingday <- factor(mydata$workingday, levels = c("Not_Workingday", "Workingday"))
#weather
mydata$weathersit <- cut(mydata$weathersit,
breaks = c(0.5, 1.5, 2.5, 3.5, 4.5),
labels = c("Weather_1", "Weather_2", "Weather_3", "Weather_4"))
mydata$weathersit <- factor(mydata$weathersit, levels = c("Weather_1", "Weather_2", "Weather_3", "Weather_4"))
head(mydata)
## instant dteday season yr mnth holiday weekday workingday weathersit
## 1 1 2011-01-01 Winter 0 1 0 6 Not_Workingday Weather_2
## 2 2 2011-01-02 Winter 0 1 0 0 Not_Workingday Weather_2
## 3 3 2011-01-03 Winter 0 1 0 1 Workingday Weather_1
## 4 4 2011-01-04 Winter 0 1 0 2 Workingday Weather_1
## 5 5 2011-01-05 Winter 0 1 0 3 Workingday Weather_1
## 6 6 2011-01-06 Winter 0 1 0 4 Workingday Weather_1
## temp atemp hum windspeed casual registered cnt
## 1 0.344167 0.363625 0.805833 0.1604460 331 654 985
## 2 0.363478 0.353739 0.696087 0.2485390 131 670 801
## 3 0.196364 0.189405 0.437273 0.2483090 120 1229 1349
## 4 0.200000 0.212122 0.590435 0.1602960 108 1454 1562
## 5 0.226957 0.229270 0.436957 0.1869000 82 1518 1600
## 6 0.204348 0.233209 0.518261 0.0895652 88 1518 1606
The year 2012 dataset will be used for prediction.
# Set a seed for reproducibility
# Setting a seed is crucial for reproducibility and consistency of results.
# It ensures that when you run the same code multiple times, you'll ...
# ... obtain the same results.
set.seed(20231103)
# Filter dataset for year 2011 (0) and year 2012 (1)
year2011 <- mydata[mydata$yr == 0, ]
year2012 <- mydata[mydata$yr == 1, ]
# Add 'Type' column and assign values
year2011$Type <- "Training"
year2012$Type <- "Validation"
# Combine training and validation data
mydata <- rbind(year2011, year2012)
#creating training data our of mydata (training -> data from 2011)
training_data <- subset(mydata, Type == "Training")
summary(training_data)
## instant dteday season yr mnth
## Min. : 1 Min. :2011-01-01 Winter:90 Min. :0 Min. : 1.000
## 1st Qu.: 92 1st Qu.:2011-04-02 Spring:92 1st Qu.:0 1st Qu.: 4.000
## Median :183 Median :2011-07-02 Summer:94 Median :0 Median : 7.000
## Mean :183 Mean :2011-07-02 Fall :89 Mean :0 Mean : 6.526
## 3rd Qu.:274 3rd Qu.:2011-10-01 3rd Qu.:0 3rd Qu.:10.000
## Max. :365 Max. :2011-12-31 Max. :0 Max. :12.000
## holiday weekday workingday weathersit
## Min. :0.0000 Min. :0.000 Not_Workingday:115 Weather_1:226
## 1st Qu.:0.0000 1st Qu.:1.000 Workingday :250 Weather_2:124
## Median :0.0000 Median :3.000 Weather_3: 15
## Mean :0.0274 Mean :3.008 Weather_4: 0
## 3rd Qu.:0.0000 3rd Qu.:5.000
## Max. :1.0000 Max. :6.000
## temp atemp hum windspeed
## Min. :0.05913 Min. :0.07907 Min. :0.0000 Min. :0.02239
## 1st Qu.:0.32500 1st Qu.:0.32195 1st Qu.:0.5383 1st Qu.:0.13558
## Median :0.47917 Median :0.47285 Median :0.6475 Median :0.18690
## Mean :0.48666 Mean :0.46684 Mean :0.6437 Mean :0.19140
## 3rd Qu.:0.65667 3rd Qu.:0.61238 3rd Qu.:0.7421 3rd Qu.:0.23508
## Max. :0.84917 Max. :0.84090 Max. :0.9725 Max. :0.50746
## casual registered cnt Type
## Min. : 9.0 Min. : 416 Min. : 431 Length:365
## 1st Qu.: 222.0 1st Qu.:1730 1st Qu.:2132 Class :character
## Median : 614.0 Median :2915 Median :3740 Mode :character
## Mean : 677.4 Mean :2728 Mean :3406
## 3rd Qu.: 871.0 3rd Qu.:3632 3rd Qu.:4586
## Max. :3065.0 Max. :4614 Max. :6043
#creating training data our of mydata (validatioin -> data from 2012)
validation_data <- subset(mydata, Type == "Validation")
summary(validation_data)
## instant dteday season yr mnth
## Min. :366.0 Min. :2012-01-01 Winter:91 Min. :1 Min. : 1.000
## 1st Qu.:457.2 1st Qu.:2012-04-01 Spring:92 1st Qu.:1 1st Qu.: 4.000
## Median :548.5 Median :2012-07-01 Summer:94 Median :1 Median : 7.000
## Mean :548.5 Mean :2012-07-01 Fall :89 Mean :1 Mean : 6.514
## 3rd Qu.:639.8 3rd Qu.:2012-09-30 3rd Qu.:1 3rd Qu.: 9.750
## Max. :731.0 Max. :2012-12-31 Max. :1 Max. :12.000
## holiday weekday workingday weathersit
## Min. :0.00000 Min. :0.000 Not_Workingday:116 Weather_1:237
## 1st Qu.:0.00000 1st Qu.:1.000 Workingday :250 Weather_2:123
## Median :0.00000 Median :3.000 Weather_3: 6
## Mean :0.03005 Mean :2.986 Weather_4: 0
## 3rd Qu.:0.00000 3rd Qu.:5.000
## Max. :1.00000 Max. :6.000
## temp atemp hum windspeed
## Min. :0.1075 Min. :0.1017 Min. :0.2542 Min. :0.04665
## 1st Qu.:0.3477 1st Qu.:0.3507 1st Qu.:0.5081 1st Qu.:0.13372
## Median :0.5142 Median :0.4978 Median :0.6119 Median :0.17475
## Mean :0.5041 Mean :0.4819 Mean :0.6122 Mean :0.18957
## 3rd Qu.:0.6540 3rd Qu.:0.6076 3rd Qu.:0.7111 3rd Qu.:0.23120
## Max. :0.8617 Max. :0.8049 Max. :0.9250 Max. :0.44156
## casual registered cnt Type
## Min. : 2.0 Min. : 20 Min. : 22 Length:366
## 1st Qu.: 429.8 1st Qu.:3730 1st Qu.:4369 Class :character
## Median : 904.5 Median :4776 Median :5927 Mode :character
## Mean :1018.5 Mean :4581 Mean :5600
## 3rd Qu.:1262.0 3rd Qu.:5663 3rd Qu.:7011
## Max. :3410.0 Max. :6946 Max. :8714
We choose 3 questions of interest:
#registered
ggplot(mydata, aes(x = Type, y = registered, color = Type)) +
geom_boxplot(position = position_dodge(width = 0.8)) +
labs(x = "Type", y = "# Registered Users") +
scale_fill_manual(name = "Dataset Type", values = c("Training" = "blue", "Validation" = "red")) +
ggtitle("# Registered Users by Type (Training vs. Validation)") +
theme_bw()
#sqrt(casual)
ggplot(mydata, aes(x = Type, y = sqrt(casual), color = Type)) +
geom_boxplot(position = position_dodge(width = 0.8)) +
labs(x = "Type", y = "Square Root # Casual Users") +
scale_fill_manual(name = "Dataset Type", values = c("Training" = "blue", "Validation" = "red")) +
ggtitle("Square Root # Casual Users by Type (Training vs. Validation)") +
theme_bw()
#cnt
ggplot(mydata, aes(x = Type, y = cnt, color = Type)) +
geom_boxplot(position = position_dodge(width = 0.8)) +
labs(x = "Type", y = "# Total Users") +
scale_fill_manual(name = "Dataset Type", values = c("Training" = "blue", "Validation" = "red")) +
ggtitle("# Total Users by Type (Training vs. Validation)") +
theme_bw()
# Function to calculate statistics
calculate_statistics <- function(x) {
return(c(
Min = min(x, na.rm = TRUE),
Q1 = quantile(x, 0.25, na.rm = TRUE),
Median = median(x, na.rm = TRUE),
Mean = mean(x, na.rm = TRUE),
Q3 = quantile(x, 0.75, na.rm = TRUE),
Max = max(x, na.rm = TRUE)
))
}
The Histogram shows that the distribution of cnt in training dataset shows a bimodal shape which has two peaks, while the validation shows left skewed with only one peak.
par(mfrow = c(1, 2))
hist(training_data$cnt, main = "Training Set Histogram", xlab = "# bikes rented", col = "blue")
hist(validation_data$cnt, main = "Validation Set Histogram", xlab = "# bikes rented", col = "red")
par(mfrow = c(1, 1))
The histograms for registered users are similarly normally distributed, and both are slightly right skewed. The histogram of sqrt(casual) users for each data set shows similarity.
#for registered
par(mfrow = c(1, 2))
hist(training_data$registered, main = "Training Set Histogram", xlab = "registered users", col = "blue")
hist(validation_data$registered, main = "Validation Set Histogram", xlab = "registered users", col = "red")
par(mfrow = c(1, 1))
#for casual
par(mfrow = c(1, 2))
hist(sqrt(training_data$casual), main = "Training Set Histogram", xlab = "sqrt casual users", col = "blue")
hist(sqrt(validation_data$casual), main = "Validation Set Histogram", xlab = "sqrt casual users", col = "red")
par(mfrow = c(1, 1))
The categorical variables in the two data sets have similar counts and proportions.
#model1 <- lm(registered~as.factor(workingday) + as.factor(yr) + as.factor(weather))
#model2 <- lm(sqrt(casual)~as.factor(season) + as.factor(yr) + as.factor(weather) + as.factor(workingday))
#model3 <- lm(cnt ~ temp + wind + hum + as.factor(yr) + as.factor(workingday))
# WorkingDay
training_counts_workingday <- table(training_data$workingday)
validation_counts_workingday <- table(validation_data$workingday)
training_props_workingday <- prop.table(training_counts_workingday)
validation_props_workingday <- prop.table(validation_counts_workingday)
# Season
training_counts_season <- table(training_data$season)
validation_counts_season <- table(validation_data$season)
training_props_season <- prop.table(training_counts_season)
validation_props_season <- prop.table(validation_counts_season)
# Year
training_counts_yr <- table(training_data$yr)
validation_counts_yr <- table(validation_data$yr)
training_props_yr <- prop.table(training_counts_yr)
validation_props_yr <- prop.table(validation_counts_yr)
# Holiday
training_counts_holiday <- table(training_data$holiday)
validation_counts_holiday <- table(validation_data$holiday)
training_props_holiday <- prop.table(training_counts_holiday)
validation_props_holiday <- prop.table(validation_counts_holiday)
# Weathersit
training_counts_weathersit <- table(training_data$weathersit)
validation_counts_weathersit <- table(validation_data$weathersit)
training_props_weathersit <- prop.table(training_counts_weathersit)
validation_props_weathersit <- prop.table(validation_counts_weathersit)
# Printing out the tables
# For a better layout, you might want to print these one by one or use a custom layout with a package like gridExtra
#working day
cat("Training Set - Working Day Count and Proportion")
## Training Set - Working Day Count and Proportion
print(training_counts_workingday)
##
## Not_Workingday Workingday
## 115 250
print(training_props_workingday)
##
## Not_Workingday Workingday
## 0.3150685 0.6849315
cat("\nValidation Set - Working Day Count and Proportion")
##
## Validation Set - Working Day Count and Proportion
print(validation_counts_workingday)
##
## Not_Workingday Workingday
## 116 250
print(validation_props_workingday)
##
## Not_Workingday Workingday
## 0.3169399 0.6830601
cat("\n")
#season
cat("Training Set - Season Count and Proportion")
## Training Set - Season Count and Proportion
print(training_counts_season)
##
## Winter Spring Summer Fall
## 90 92 94 89
print(training_props_season)
##
## Winter Spring Summer Fall
## 0.2465753 0.2520548 0.2575342 0.2438356
cat("\nValidation Set - Season Count and Proportion")
##
## Validation Set - Season Count and Proportion
print(validation_counts_season)
##
## Winter Spring Summer Fall
## 91 92 94 89
print(validation_props_season)
##
## Winter Spring Summer Fall
## 0.2486339 0.2513661 0.2568306 0.2431694
cat("\n")
#year
cat("Training Set - Year Count and Proportion")
## Training Set - Year Count and Proportion
print(training_counts_yr)
##
## 0
## 365
print(training_props_yr)
##
## 0
## 1
cat("\nValidation Set - Year Count and Proportion")
##
## Validation Set - Year Count and Proportion
print(validation_counts_yr)
##
## 1
## 366
print(validation_props_yr)
##
## 1
## 1
cat("\n")
#holiday
cat("Training Set - Holiday Count and Proportion")
## Training Set - Holiday Count and Proportion
print(training_counts_holiday)
##
## 0 1
## 355 10
print(training_props_holiday)
##
## 0 1
## 0.97260274 0.02739726
cat("\nValidation Set - Holiday Count and Proportion")
##
## Validation Set - Holiday Count and Proportion
print(validation_counts_holiday)
##
## 0 1
## 355 11
print(validation_props_holiday)
##
## 0 1
## 0.96994536 0.03005464
cat("\n")
#weathersit
cat("Training Set - Weathersit Count and Proportion")
## Training Set - Weathersit Count and Proportion
print(training_counts_weathersit)
##
## Weather_1 Weather_2 Weather_3 Weather_4
## 226 124 15 0
print(training_props_weathersit)
##
## Weather_1 Weather_2 Weather_3 Weather_4
## 0.61917808 0.33972603 0.04109589 0.00000000
cat("\nValidation Set - Weathersit Count and Proportion")
##
## Validation Set - Weathersit Count and Proportion
print(validation_counts_weathersit)
##
## Weather_1 Weather_2 Weather_3 Weather_4
## 237 123 6 0
print(validation_props_weathersit)
##
## Weather_1 Weather_2 Weather_3 Weather_4
## 0.64754098 0.33606557 0.01639344 0.00000000
All four continuous variables are similarly distributed in the two data sets. In general, the temperature in 2012 is higher than in 2011, and the humidity is relatively lower and more widely spread in 2012 than that in 2011. Also, the wind speed is generally higher in 2012.
#For temp
par(mfrow = c(1, 2))
hist(training_data$temp, main = "Training Set Histogram", xlab = "temperature", col = "blue")
hist(validation_data$temp, main = "Validation Set Histogram", xlab = "temperature", col = "red")
par(mfrow = c(1, 1))
#For feeling temp
par(mfrow = c(1, 2))
hist(training_data$atemp, main = "Training Set Histogram", xlab = "feeling temperature", col = "blue")
hist(validation_data$atemp, main = "Validation Set Histogram", xlab = "feeling temperature", col = "red")
par(mfrow = c(1, 1))
#For hum
par(mfrow = c(1, 2))
hist(training_data$hum, main = "Training Set Histogram", xlab = "humidity", col = "blue")
hist(validation_data$hum, main = "Validation Set Histogram", xlab = "humidity", col = "red")
par(mfrow = c(1, 1))
#For windspeed
par(mfrow = c(1, 2))
hist(training_data$windspeed, main = "Training Set Histogram", xlab = "windspeed", col = "blue")
hist(validation_data$windspeed, main = "Validation Set Histogram", xlab = "windspeed", col = "red")
par(mfrow = c(1, 1))
We are fitting full model which will be used to stepwise regression later. We address multicollinearity issue and delete variables that might cause it.
#viki trail 1
full_model1 <- lm(registered ~ as.factor(workingday) + as.factor(season)+ holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
data = training_data)
full_model2 <- lm( sqrt(casual)~ as.factor(workingday) + as.factor(season) + holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
data = training_data)
full_model3 <- lm( cnt ~ casual +as.factor(workingday) + as.factor(season) + holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
data = training_data)
summary(full_model1)
##
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) +
## holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
## data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1831.18 -249.18 45.54 306.01 1029.65
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 939.50 182.22 5.156 4.21e-07 ***
## as.factor(workingday)Workingday 698.55 55.36 12.617 < 2e-16 ***
## as.factor(season)Spring 812.16 92.94 8.738 < 2e-16 ***
## as.factor(season)Summer 794.67 121.99 6.514 2.52e-10 ***
## as.factor(season)Fall 1306.52 82.32 15.872 < 2e-16 ***
## holiday -196.65 156.92 -1.253 0.210974
## weekday 17.57 12.46 1.410 0.159397
## temp 2757.41 236.16 11.676 < 2e-16 ***
## hum -610.45 230.43 -2.649 0.008431 **
## windspeed -1469.23 351.94 -4.175 3.76e-05 ***
## as.factor(weathersit)Weather_2 -224.29 66.00 -3.398 0.000756 ***
## as.factor(weathersit)Weather_3 -1377.56 146.15 -9.426 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 471.3 on 353 degrees of freedom
## Multiple R-squared: 0.8083, Adjusted R-squared: 0.8024
## F-statistic: 135.3 on 11 and 353 DF, p-value: < 2.2e-16
summary(full_model2)
##
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) +
## holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
## data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.0473 -3.0247 -0.0296 2.7154 18.3344
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.89493 1.84453 10.786 < 2e-16 ***
## as.factor(workingday)Workingday -11.35502 0.56044 -20.261 < 2e-16 ***
## as.factor(season)Spring 6.26683 0.94083 6.661 1.05e-10 ***
## as.factor(season)Summer 3.36323 1.23484 2.724 0.00678 **
## as.factor(season)Fall 4.43365 0.83329 5.321 1.84e-07 ***
## holiday -2.56032 1.58848 -1.612 0.10790
## weekday 0.09252 0.12616 0.733 0.46381
## temp 32.15458 2.39057 13.451 < 2e-16 ***
## hum -6.06923 2.33258 -2.602 0.00966 **
## windspeed -14.22007 3.56256 -3.992 7.99e-05 ***
## as.factor(weathersit)Weather_2 -2.09773 0.66811 -3.140 0.00183 **
## as.factor(weathersit)Weather_3 -8.83239 1.47944 -5.970 5.78e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.771 on 353 degrees of freedom
## Multiple R-squared: 0.8019, Adjusted R-squared: 0.7957
## F-statistic: 129.9 on 11 and 353 DF, p-value: < 2.2e-16
summary(full_model3)
##
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) +
## holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
## data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2086.82 -250.88 43.02 316.95 994.65
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.822e+02 1.847e+02 3.694 0.000255 ***
## casual 1.393e+00 8.172e-02 17.048 < 2e-16 ***
## as.factor(workingday)Workingday 9.622e+02 7.673e+01 12.540 < 2e-16 ***
## as.factor(season)Spring 7.127e+02 9.250e+01 7.705 1.35e-13 ***
## as.factor(season)Summer 7.396e+02 1.189e+02 6.221 1.40e-09 ***
## as.factor(season)Fall 1.250e+03 8.071e+01 15.488 < 2e-16 ***
## holiday -1.457e+02 1.526e+02 -0.955 0.340318
## weekday 1.545e+01 1.210e+01 1.277 0.202431
## temp 2.167e+03 2.599e+02 8.338 1.72e-15 ***
## hum -4.957e+02 2.248e+02 -2.205 0.028098 *
## windspeed -1.132e+03 3.485e+02 -3.248 0.001273 **
## as.factor(weathersit)Weather_2 -1.881e+02 6.446e+01 -2.918 0.003754 **
## as.factor(weathersit)Weather_3 -1.256e+03 1.440e+02 -8.721 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 457.2 on 352 degrees of freedom
## Multiple R-squared: 0.8937, Adjusted R-squared: 0.8901
## F-statistic: 246.6 on 12 and 352 DF, p-value: < 2.2e-16
#plot added variables
avPlots(full_model1)
avPlots(full_model2)
avPlots(full_model3)
stepwise_model1 <- step(full_model1, direction = "both")
## Start: AIC=4505.31
## registered ~ as.factor(workingday) + as.factor(season) + holiday +
## weekday + temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## - holiday 1 348826 78757811 4504.9
## <none> 78408985 4505.3
## - weekday 1 441649 78850634 4505.4
## - hum 1 1558913 79967898 4510.5
## - windspeed 1 3871169 82280153 4520.9
## - as.factor(weathersit) 2 19753550 98162534 4583.3
## - temp 1 30282108 108691093 4622.5
## - as.factor(workingday) 1 35361497 113770481 4639.2
## - as.factor(season) 3 60396050 138805035 4707.8
##
## Step: AIC=4504.93
## registered ~ as.factor(workingday) + as.factor(season) + weekday +
## temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## <none> 78757811 4504.9
## - weekday 1 509480 79267291 4505.3
## + holiday 1 348826 78408985 4505.3
## - hum 1 1478325 80236136 4509.7
## - windspeed 1 3852838 82610649 4520.4
## - as.factor(weathersit) 2 19849983 98607794 4583.0
## - temp 1 30118568 108876379 4621.1
## - as.factor(workingday) 1 39599907 118357718 4651.6
## - as.factor(season) 3 60285076 139042887 4706.4
stepwise_model2 <- step(full_model2, direction = "both")
## Start: AIC=1152.44
## sqrt(casual) ~ as.factor(workingday) + as.factor(season) + holiday +
## weekday + temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## - weekday 1 12.2 8046.8 1151.0
## <none> 8034.6 1152.4
## - holiday 1 59.1 8093.7 1153.1
## - hum 1 154.1 8188.7 1157.4
## - windspeed 1 362.6 8397.2 1166.5
## - as.factor(weathersit) 2 829.0 8863.6 1184.3
## - as.factor(season) 3 1441.3 9475.9 1206.7
## - temp 1 4117.9 12152.4 1301.5
## - as.factor(workingday) 1 9343.4 17378.0 1432.0
##
## Step: AIC=1151
## sqrt(casual) ~ as.factor(workingday) + as.factor(season) + holiday +
## temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## <none> 8046.8 1151.0
## - holiday 1 63.9 8110.7 1151.9
## + weekday 1 12.2 8034.6 1152.4
## - hum 1 164.6 8211.5 1156.4
## - windspeed 1 359.7 8406.5 1165.0
## - as.factor(weathersit) 2 818.8 8865.6 1182.4
## - as.factor(season) 3 1445.3 9492.1 1205.3
## - temp 1 4107.9 12154.7 1299.5
## - as.factor(workingday) 1 9348.6 17395.5 1430.4
stepwise_model3 <- step(full_model3, direction = "both")
## Start: AIC=4484.06
## cnt ~ casual + as.factor(workingday) + as.factor(season) + holiday +
## weekday + temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## - holiday 1 190554 73761582 4483.0
## - weekday 1 340857 73911885 4483.7
## <none> 73571028 4484.1
## - hum 1 1016235 74587264 4487.1
## - windspeed 1 2205158 75776187 4492.8
## - temp 1 14531329 88102358 4547.9
## - as.factor(weathersit) 2 15979331 89550360 4551.8
## - as.factor(workingday) 1 32868919 106439948 4616.9
## - as.factor(season) 3 54301778 127872806 4679.8
## - casual 1 60747394 134318422 4701.8
##
## Step: AIC=4483
## cnt ~ casual + as.factor(workingday) + as.factor(season) + weekday +
## temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## - weekday 1 383340 74144922 4482.9
## <none> 73761582 4483.0
## + holiday 1 190554 73571028 4484.1
## - hum 1 963292 74724874 4485.7
## - windspeed 1 2178046 75939628 4491.6
## - temp 1 14386687 88148269 4546.0
## - as.factor(weathersit) 2 15987090 89748672 4550.6
## - as.factor(workingday) 1 35790437 109552019 4625.4
## - as.factor(season) 3 54174909 127936491 4678.0
## - casual 1 61516967 135278549 4702.4
##
## Step: AIC=4482.9
## cnt ~ casual + as.factor(workingday) + as.factor(season) + temp +
## hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## <none> 74144922 4482.9
## + weekday 1 383340 73761582 4483.0
## + holiday 1 233037 73911885 4483.7
## - hum 1 1087039 75231961 4486.2
## - windspeed 1 2121892 76266814 4491.2
## - temp 1 14190777 88335699 4544.8
## - as.factor(weathersit) 2 15737193 89882115 4549.2
## - as.factor(workingday) 1 36139539 110284461 4625.8
## - as.factor(season) 3 54380050 128524972 4677.7
## - casual 1 62035815 136180737 4702.8
summary(stepwise_model1)
##
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) +
## weekday + temp + hum + windspeed + as.factor(weathersit),
## data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1834.15 -238.70 46.37 304.93 1029.11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 911.40 180.98 5.036 7.59e-07 ***
## as.factor(workingday)Workingday 715.86 53.66 13.341 < 2e-16 ***
## as.factor(season)Spring 815.27 92.98 8.768 < 2e-16 ***
## as.factor(season)Summer 798.73 122.04 6.545 2.09e-10 ***
## as.factor(season)Fall 1305.98 82.38 15.853 < 2e-16 ***
## weekday 18.82 12.43 1.513 0.13110
## temp 2748.78 236.25 11.635 < 2e-16 ***
## hum -593.43 230.21 -2.578 0.01035 *
## windspeed -1465.70 352.21 -4.161 3.97e-05 ***
## as.factor(weathersit)Weather_2 -229.81 65.91 -3.487 0.00055 ***
## as.factor(weathersit)Weather_3 -1381.03 146.24 -9.444 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 471.7 on 354 degrees of freedom
## Multiple R-squared: 0.8075, Adjusted R-squared: 0.802
## F-statistic: 148.5 on 10 and 354 DF, p-value: < 2.2e-16
summary(stepwise_model2)
##
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) +
## holiday + temp + hum + windspeed + as.factor(weathersit),
## data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.7811 -2.9288 -0.0188 2.7108 18.2147
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.2681 1.7718 11.439 < 2e-16 ***
## as.factor(workingday)Workingday -11.3579 0.5601 -20.280 < 2e-16 ***
## as.factor(season)Spring 6.2838 0.9399 6.685 8.99e-11 ***
## as.factor(season)Summer 3.3963 1.2332 2.754 0.00619 **
## as.factor(season)Fall 4.4543 0.8323 5.352 1.57e-07 ***
## holiday -2.6529 1.5824 -1.677 0.09452 .
## temp 32.1003 2.3879 13.443 < 2e-16 ***
## hum -6.2416 2.3192 -2.691 0.00746 **
## windspeed -14.1586 3.5593 -3.978 8.43e-05 ***
## as.factor(weathersit)Weather_2 -2.0546 0.6651 -3.089 0.00217 **
## as.factor(weathersit)Weather_3 -8.7637 1.4755 -5.939 6.84e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.768 on 354 degrees of freedom
## Multiple R-squared: 0.8016, Adjusted R-squared: 0.796
## F-statistic: 143 on 10 and 354 DF, p-value: < 2.2e-16
summary(stepwise_model3)
##
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) +
## temp + hum + windspeed + as.factor(weathersit), data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2060.85 -257.74 44.82 332.76 1011.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.189e+02 1.775e+02 4.050 6.29e-05 ***
## casual 1.403e+00 8.153e-02 17.210 < 2e-16 ***
## as.factor(workingday)Workingday 9.826e+02 7.480e+01 13.136 < 2e-16 ***
## as.factor(season)Spring 7.157e+02 9.258e+01 7.730 1.12e-13 ***
## as.factor(season)Summer 7.474e+02 1.189e+02 6.285 9.65e-10 ***
## as.factor(season)Fall 1.252e+03 8.076e+01 15.501 < 2e-16 ***
## temp 2.135e+03 2.594e+02 8.231 3.60e-15 ***
## hum -5.094e+02 2.236e+02 -2.278 0.02331 *
## windspeed -1.110e+03 3.486e+02 -3.183 0.00159 **
## as.factor(weathersit)Weather_2 -1.840e+02 6.418e+01 -2.867 0.00439 **
## as.factor(weathersit)Weather_3 -1.243e+03 1.438e+02 -8.645 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 457.7 on 354 degrees of freedom
## Multiple R-squared: 0.8928, Adjusted R-squared: 0.8898
## F-statistic: 295 on 10 and 354 DF, p-value: < 2.2e-16
avPlots(stepwise_model1)
avPlots(stepwise_model2)
avPlots(stepwise_model3)
For vif, values close to 1 indicate no multicollinearity, values between 1-3 indicate mild collinearity which is not ideal but can be considered ok since it won’t severaly impact the interpretability of the coefficients, values above indicate severe multicollinearity which is considered unreliable.
vif(stepwise_model1)
## GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.019319 1 1.009613
## as.factor(season) 3.639198 3 1.240226
## weekday 1.017995 1 1.008957
## temp 3.282494 1 1.811766
## hum 1.918458 1 1.385084
## windspeed 1.199915 1 1.095406
## as.factor(weathersit) 1.834088 2 1.163737
vif(stepwise_model2)
## GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.086910 1 1.042550
## as.factor(season) 3.640064 3 1.240276
## holiday 1.071406 1 1.035087
## temp 3.282132 1 1.811666
## hum 1.905606 1 1.380437
## windspeed 1.199328 1 1.095138
## as.factor(weathersit) 1.826326 2 1.162504
vif(stepwise_model3)
## GVIF Df GVIF^(1/(2*Df))
## casual 3.575021 1 1.890773
## as.factor(workingday) 2.104307 1 1.450623
## as.factor(season) 3.879645 3 1.253522
## temp 4.203539 1 2.050253
## hum 1.922350 1 1.386488
## windspeed 1.248688 1 1.117447
## as.factor(weathersit) 1.881977 2 1.171261
validation_data$predicted_registered <- predict(stepwise_model1, newdata = validation_data)
validation_data$predicted_sqrtcasual <- predict(stepwise_model2, newdata = validation_data)
validation_data$predicted_count <- predict(stepwise_model3, newdata = validation_data)
observed_values1 <- validation_data$registered
predicted_values1 <- validation_data$predicted_registered
# Calculate different prediction performance metrics
# Functions from the MLmetrics package
# Common regression metrics
# Calculate the Root Mean Squared Error (RMSE), which measures the ...
# ... average magnitude of prediction errors.
# Lower is better.
rmse1 <- RMSE(predicted_values1, observed_values1)
# same as
# sqrt(mean((observed_values - predicted_values)^2))
# Compute the Mean Absolute Error (MAE), indicating the average absolute ...
# ... difference between predicted and observed values.
# Lower is better.
mae1 <- MAE(predicted_values1, observed_values1)
# same as
# mean(abs(observed_values - predicted_values))
# Calculate the Mean Absolute Percentage Error (MAPE), measuring the ...
# ... average percentage difference between predicted and observed values.
mape1 <- MAPE(predicted_values1, observed_values1)
# same as
# mean(abs(predicted_values-observed_values)/observed_values)
# Determine the R-squared (R²) Score, representing the proportion of the ...
# ... variance in the observed values (of validation data set) ...
# ... explained by the predicted values from the model.
# Higher is better.
r_squared1 <- R2_Score(predicted_values1, observed_values1)
# same as
# summary(lm(observed_values ~ predicted_values))$r.squared
# Display the calculated metrics
cat("Root Mean Squared Error (RMSE):", round(rmse1, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 1929.403
cat("Mean Absolute Error (MAE):", round(mae1, digits = 4), "\n")
## Mean Absolute Error (MAE): 1799.901
cat("R-squared (R^2) Score:", round(r_squared1, digits = 4), "\n")
## R-squared (R^2) Score: -0.84
cat("Mean Absolute Percentage Error (MPE):", round(mape1, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.6342
#second question
observed_values2 <- sqrt(validation_data$casual) #should I transform it here??
predicted_values2 <- validation_data$predicted_sqrtcasual
rmse2 <- RMSE(predicted_values2, observed_values2)
mae2 <- MAE(predicted_values2, observed_values2)
mape2 <- MAPE(predicted_values2, observed_values2)
r_squared2 <- R2_Score(predicted_values2, observed_values2)
cat("Root Mean Squared Error (RMSE):", round(rmse2, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 7.5939
cat("Mean Absolute Error (MAE):", round(mae2, digits = 4), "\n")
## Mean Absolute Error (MAE): 5.9561
cat("R-squared (R^2) Score:", round(r_squared2, digits = 4), "\n")
## R-squared (R^2) Score: 0.5865
cat("Mean Absolute Percentage Error (MPE):", round(mape2, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2126
#third question
observed_values3 <- validation_data$count
predicted_values3 <- validation_data$predicted_count
rmse3 <- RMSE(predicted_values3, observed_values3)
mae3 <- MAE(predicted_values3, observed_values3)
mape3 <- MAPE(predicted_values3, observed_values3)
r_squared3 <- R2_Score(predicted_values3, observed_values3)
## Warning in mean.default(y_true): argument is not numeric or logical: returning
## NA
cat("Root Mean Squared Error (RMSE):", round(rmse3, digits = 4), "\n")
## Root Mean Squared Error (RMSE): NaN
cat("Mean Absolute Error (MAE):", round(mae3, digits = 4), "\n")
## Mean Absolute Error (MAE): NaN
cat("R-squared (R^2) Score:", round(r_squared3, digits = 4), "\n")
## R-squared (R^2) Score: NaN
cat("Mean Absolute Percentage Error (MPE):", round(mape3, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): NaN
we choose model 1 because it has best RMSE for validation
#stepwise regression on model 1
m.mlr <- lm(registered ~ as.factor(workingday) + as.factor(season) +
temp + windspeed + as.factor(weathersit),
data = training_data)
#original regression question 1
m.mlr1 <- lm(registered ~ as.factor(workingday) + as.factor(weathersit),
data = training_data)
#question2
m.mlr2 <- lm(sqrt(casual) ~ as.factor(workingday) + as.factor(season) + holiday +
temp + hum + windspeed + as.factor(weathersit),
data = training_data)
diagnostics_df <- data.frame(
Residuals = resid(m.mlr),
Fitted_Values = fitted(m.mlr),
Standardized_Residuals = rstandard(m.mlr),
Leverage = hatvalues(m.mlr)
#Date = year2011$dteday
)
# Create the standardized residuals vs. fitted values plot
ggplot(diagnostics_df, aes(x = Fitted_Values, y = Residuals)) +
geom_point(col="blue", alpha=0.75) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs( title = "Residuals vs. Fitted Values",
x = "Fitted Values", y = "Residuals") +
theme_bw()
# Create the QQ plot
ggplot(diagnostics_df, aes(sample = Standardized_Residuals)) +
stat_qq(aes(sample = Standardized_Residuals), distribution = qnorm,
size = 2, col="blue", alpha = 0.75) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
labs(title = "Standardized Residual QQ Plot",
x = "Theoretical Quantiles", y = "Sample Quantiles") +
theme_bw()
# Create the sqrt(|standardized residuals|) vs. fitted values plot
ggplot(diagnostics_df, aes(x = Fitted_Values, y = sqrt(abs(Standardized_Residuals)))) +
geom_point(col="blue", alpha=0.75) +
labs( title = "Residuals vs. Fitted Values",
x = "Fitted Values", y = "sqrt(|Standardized Residuals|)") +
theme_bw()
# Leverage vs Standardized Residuals
ggplot(diagnostics_df, aes(x = Leverage, y = Standardized_Residuals)) +
geom_point(alpha = 0.75) +
labs(title = "Standardized Residuals vs. Leverage Plot",
x = "Leverage", y = "Standardized Residuals") +
theme_bw()
validation2012<- subset(validation_data)
validation2012$predicted_registered <- predict(m.mlr, newdata = validation2012)
summary(validation2012$predicted_registered)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -673.6 2234.2 3031.8 2812.5 3492.8 4182.6
# Extract observed and predicted values
observed_values <- validation2012$registered
predicted_values <- validation2012$predicted_registered
rmse <- RMSE(predicted_values, observed_values)
mae <- MAE(predicted_values, observed_values)
mape <- MAPE(predicted_values, observed_values)
r_squared <- R2_Score(predicted_values, observed_values)
cat("Root Mean Squared Error (RMSE):", round(rmse, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 1949.03
cat("Mean Absolute Error (MAE):", round(mae, digits = 4), "\n")
## Mean Absolute Error (MAE): 1815.744
cat("R-squared (R^2) Score:", round(r_squared, digits = 4), "\n")
## R-squared (R^2) Score: -0.8776
cat("Mean Absolute Percentage Error (MPE):", round(mape, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.6476
ggplot(validation2012, aes(x = observed_values, y = predicted_values)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
labs(x = "Observed Values", y = "Predicted Values",
title = "Observed vs. Predicted Values") +
theme_bw()
# Residuals plot
ggplot(validation2012, aes(x = 1:nrow(validation2012), y = observed_values-predicted_values)) +
geom_point() +
geom_abline(intercept = 0, slope = 0, color = "red", linetype = "dashed") +
labs(x = "Observation Index", y = "Residuals",
title = "Observed vs. Predicted Values") +
theme_bw()
#kaylee trial 1
full_model1 <- lm(registered ~ as.factor(workingday) + as.factor(season)+ as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit),
data = training_data)
full_model2 <- lm( sqrt(casual)~ as.factor(workingday) + as.factor(season)+ as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit),
data = training_data)
full_model3 <- lm( cnt ~ casual+as.factor(workingday) + as.factor(season)+ as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit),
data = training_data)
summary(full_model1)
##
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) +
## as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed +
## as.factor(weathersit), data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1779.75 -237.37 62.58 308.19 1085.42
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 930.21 184.49 5.042 7.42e-07 ***
## as.factor(workingday)Workingday 746.78 93.56 7.982 2.10e-14 ***
## as.factor(season)Spring 816.26 93.31 8.747 < 2e-16 ***
## as.factor(season)Summer 799.01 122.44 6.526 2.39e-10 ***
## as.factor(season)Fall 1309.72 82.70 15.838 < 2e-16 ***
## as.factor(holiday)1 -114.41 178.98 -0.639 0.523072
## as.factor(weekday)1 -49.07 94.39 -0.520 0.603455
## as.factor(weekday)2 58.64 93.37 0.628 0.530438
## as.factor(weekday)3 41.91 94.55 0.443 0.657891
## as.factor(weekday)4 49.53 93.32 0.531 0.595893
## as.factor(weekday)5 NA NA NA NA
## as.factor(weekday)6 136.82 92.73 1.476 0.140967
## temp 2744.64 237.11 11.575 < 2e-16 ***
## hum -610.19 232.17 -2.628 0.008963 **
## windspeed -1480.21 353.67 -4.185 3.61e-05 ***
## as.factor(weathersit)Weather_2 -227.82 66.36 -3.433 0.000669 ***
## as.factor(weathersit)Weather_3 -1401.20 148.05 -9.465 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 472.7 on 349 degrees of freedom
## Multiple R-squared: 0.8094, Adjusted R-squared: 0.8012
## F-statistic: 98.77 on 15 and 349 DF, p-value: < 2.2e-16
summary(full_model2)
##
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) +
## as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed +
## as.factor(weathersit), data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.0744 -2.6269 0.0553 2.7719 18.2465
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.8250 1.8394 10.778 < 2e-16 ***
## as.factor(workingday)Workingday -9.4840 0.9328 -10.168 < 2e-16 ***
## as.factor(season)Spring 6.1465 0.9304 6.607 1.47e-10 ***
## as.factor(season)Summer 3.2409 1.2208 2.655 0.008298 **
## as.factor(season)Fall 4.3307 0.8245 5.253 2.61e-07 ***
## as.factor(holiday)1 -1.5567 1.7844 -0.872 0.383605
## as.factor(weekday)1 -1.0232 0.9411 -1.087 0.277666
## as.factor(weekday)2 -2.2992 0.9309 -2.470 0.013998 *
## as.factor(weekday)3 -2.8697 0.9427 -3.044 0.002511 **
## as.factor(weekday)4 -2.6316 0.9304 -2.829 0.004946 **
## as.factor(weekday)5 NA NA NA NA
## as.factor(weekday)6 0.2164 0.9245 0.234 0.815070
## temp 32.5149 2.3640 13.754 < 2e-16 ***
## hum -6.0657 2.3148 -2.620 0.009165 **
## windspeed -13.6013 3.5261 -3.857 0.000137 ***
## as.factor(weathersit)Weather_2 -2.0200 0.6616 -3.053 0.002439 **
## as.factor(weathersit)Weather_3 -8.1185 1.4760 -5.500 7.35e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.713 on 349 degrees of freedom
## Multiple R-squared: 0.8088, Adjusted R-squared: 0.8006
## F-statistic: 98.44 on 15 and 349 DF, p-value: < 2.2e-16
summary(full_model3)
##
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) +
## as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed +
## as.factor(weathersit), data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2023.28 -241.28 44.38 312.36 992.46
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.626e+02 1.863e+02 3.557 0.000427 ***
## casual 1.413e+00 8.248e-02 17.133 < 2e-16 ***
## as.factor(workingday)Workingday 9.866e+02 1.024e+02 9.637 < 2e-16 ***
## as.factor(season)Spring 7.140e+02 9.253e+01 7.716 1.28e-13 ***
## as.factor(season)Summer 7.434e+02 1.189e+02 6.250 1.20e-09 ***
## as.factor(season)Fall 1.252e+03 8.080e+01 15.499 < 2e-16 ***
## as.factor(holiday)1 -8.103e+01 1.732e+02 -0.468 0.640274
## as.factor(weekday)1 -3.137e+01 9.136e+01 -0.343 0.731555
## as.factor(weekday)2 9.987e+01 9.068e+01 1.101 0.271516
## as.factor(weekday)3 9.560e+01 9.208e+01 1.038 0.299872
## as.factor(weekday)4 9.781e+01 9.077e+01 1.078 0.281978
## as.factor(weekday)5 NA NA NA NA
## as.factor(weekday)6 1.269e+02 8.970e+01 1.415 0.157935
## temp 2.118e+03 2.613e+02 8.106 9.01e-15 ***
## hum -4.896e+02 2.258e+02 -2.168 0.030838 *
## windspeed -1.137e+03 3.488e+02 -3.261 0.001221 **
## as.factor(weathersit)Weather_2 -1.913e+02 6.460e+01 -2.961 0.003278 **
## as.factor(weathersit)Weather_3 -1.286e+03 1.450e+02 -8.872 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 457.2 on 348 degrees of freedom
## Multiple R-squared: 0.8949, Adjusted R-squared: 0.89
## F-statistic: 185.1 on 16 and 348 DF, p-value: < 2.2e-16
#plot added variables
avPlots(full_model1)
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
avPlots(full_model2)
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
avPlots(full_model3)
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
Compared with Viki’s first trial, the AIC for model2 improved as well.(1151 to 1147)
stepwise_model1 <- step(full_model1, direction = "both")
## Start: AIC=4511.35
## registered ~ as.factor(workingday) + as.factor(season) + as.factor(holiday) +
## as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit)
##
##
## Step: AIC=4511.35
## registered ~ as.factor(workingday) + as.factor(season) + as.factor(weekday) +
## temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## - as.factor(weekday) 6 1278049 79267291 4505.3
## <none> 77989242 4511.3
## - hum 1 1543574 79532816 4516.5
## - windspeed 1 3914363 81903605 4527.2
## - as.factor(workingday) 1 6617374 84606616 4539.1
## - as.factor(weathersit) 2 20032722 98021964 4590.8
## - temp 1 29941742 107930985 4627.9
## - as.factor(season) 3 60480699 138469941 4714.9
##
## Step: AIC=4505.28
## registered ~ as.factor(workingday) + as.factor(season) + temp +
## hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## <none> 79267291 4505.3
## + as.factor(holiday) 1 416658 78850634 4505.4
## - hum 1 1666159 80933450 4510.9
## + as.factor(weekday) 6 1278049 77989242 4511.3
## - windspeed 1 3787445 83054736 4520.3
## - as.factor(weathersit) 2 19539168 98806460 4581.7
## - temp 1 29890751 109158042 4620.1
## - as.factor(workingday) 1 39726359 118993650 4651.6
## - as.factor(season) 3 60612796 139880087 4706.6
stepwise_model2 <- step(full_model2, direction = "both")
## Start: AIC=1147.39
## sqrt(casual) ~ as.factor(workingday) + as.factor(season) + as.factor(holiday) +
## as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit)
##
##
## Step: AIC=1147.39
## sqrt(casual) ~ as.factor(workingday) + as.factor(season) + as.factor(weekday) +
## temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## <none> 7752.3 1147.4
## - as.factor(weekday) 6 358.4 8110.7 1151.9
## - hum 1 152.5 7904.9 1152.5
## - windspeed 1 330.5 8082.8 1160.6
## - as.factor(workingday) 1 560.7 8313.0 1170.9
## - as.factor(weathersit) 2 694.1 8446.4 1174.7
## - as.factor(season) 3 1393.8 9146.2 1201.7
## - temp 1 4202.1 11954.5 1303.5
stepwise_model3 <- step(full_model3, direction = "both")
## Start: AIC=4487.94
## cnt ~ casual + as.factor(workingday) + as.factor(season) + as.factor(holiday) +
## as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit)
##
##
## Step: AIC=4487.94
## cnt ~ casual + as.factor(workingday) + as.factor(season) + as.factor(weekday) +
## temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## - as.factor(weekday) 6 1399230 74144922 4482.9
## <none> 72745692 4487.9
## - hum 1 982509 73728201 4490.8
## - windspeed 1 2222597 74968289 4496.9
## - as.factor(workingday) 1 9482947 82228639 4530.7
## - temp 1 13734596 86480288 4549.1
## - as.factor(weathersit) 2 16535261 89280953 4558.7
## - as.factor(season) 3 54344672 127090364 4685.6
## - casual 1 61362707 134108399 4709.2
##
## Step: AIC=4482.9
## cnt ~ casual + as.factor(workingday) + as.factor(season) + temp +
## hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## <none> 74144922 4482.9
## + as.factor(holiday) 1 233037 73911885 4483.7
## - hum 1 1087039 75231961 4486.2
## + as.factor(weekday) 6 1399230 72745692 4487.9
## - windspeed 1 2121892 76266814 4491.2
## - temp 1 14190777 88335699 4544.8
## - as.factor(weathersit) 2 15737193 89882115 4549.2
## - as.factor(workingday) 1 36139539 110284461 4625.8
## - as.factor(season) 3 54380050 128524972 4677.7
## - casual 1 62035815 136180737 4702.8
summary(stepwise_model1)
##
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) +
## temp + hum + windspeed + as.factor(weathersit), data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1794.6 -246.9 47.2 311.4 1056.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 985.06 174.62 5.641 3.46e-08 ***
## as.factor(workingday)Workingday 716.94 53.75 13.338 < 2e-16 ***
## as.factor(season)Spring 819.04 93.12 8.796 < 2e-16 ***
## as.factor(season)Summer 805.89 122.17 6.596 1.53e-10 ***
## as.factor(season)Fall 1310.16 82.49 15.883 < 2e-16 ***
## temp 2736.84 236.54 11.570 < 2e-16 ***
## hum -627.06 229.55 -2.732 0.006617 **
## windspeed -1452.78 352.74 -4.119 4.75e-05 ***
## as.factor(weathersit)Weather_2 -221.52 65.80 -3.367 0.000844 ***
## as.factor(weathersit)Weather_3 -1367.32 146.23 -9.351 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 472.5 on 355 degrees of freedom
## Multiple R-squared: 0.8062, Adjusted R-squared: 0.8013
## F-statistic: 164.1 on 9 and 355 DF, p-value: < 2.2e-16
summary(stepwise_model2)
##
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) +
## as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit),
## data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.0744 -2.6269 0.0553 2.7719 18.2465
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.8250 1.8394 10.778 < 2e-16 ***
## as.factor(workingday)Workingday -7.9273 1.5778 -5.024 8.09e-07 ***
## as.factor(season)Spring 6.1465 0.9304 6.607 1.47e-10 ***
## as.factor(season)Summer 3.2409 1.2208 2.655 0.008298 **
## as.factor(season)Fall 4.3307 0.8245 5.253 2.61e-07 ***
## as.factor(weekday)1 -2.5799 1.6509 -1.563 0.119019
## as.factor(weekday)2 -3.8559 1.8350 -2.101 0.036332 *
## as.factor(weekday)3 -4.4264 1.8368 -2.410 0.016478 *
## as.factor(weekday)4 -4.1883 1.8147 -2.308 0.021585 *
## as.factor(weekday)5 -1.5567 1.7844 -0.872 0.383605
## as.factor(weekday)6 0.2164 0.9245 0.234 0.815070
## temp 32.5149 2.3640 13.754 < 2e-16 ***
## hum -6.0657 2.3148 -2.620 0.009165 **
## windspeed -13.6013 3.5261 -3.857 0.000137 ***
## as.factor(weathersit)Weather_2 -2.0200 0.6616 -3.053 0.002439 **
## as.factor(weathersit)Weather_3 -8.1185 1.4760 -5.500 7.35e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.713 on 349 degrees of freedom
## Multiple R-squared: 0.8088, Adjusted R-squared: 0.8006
## F-statistic: 98.44 on 15 and 349 DF, p-value: < 2.2e-16
summary(stepwise_model3)
##
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) +
## temp + hum + windspeed + as.factor(weathersit), data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2060.85 -257.74 44.82 332.76 1011.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.189e+02 1.775e+02 4.050 6.29e-05 ***
## casual 1.403e+00 8.153e-02 17.210 < 2e-16 ***
## as.factor(workingday)Workingday 9.826e+02 7.480e+01 13.136 < 2e-16 ***
## as.factor(season)Spring 7.157e+02 9.258e+01 7.730 1.12e-13 ***
## as.factor(season)Summer 7.474e+02 1.189e+02 6.285 9.65e-10 ***
## as.factor(season)Fall 1.252e+03 8.076e+01 15.501 < 2e-16 ***
## temp 2.135e+03 2.594e+02 8.231 3.60e-15 ***
## hum -5.094e+02 2.236e+02 -2.278 0.02331 *
## windspeed -1.110e+03 3.486e+02 -3.183 0.00159 **
## as.factor(weathersit)Weather_2 -1.840e+02 6.418e+01 -2.867 0.00439 **
## as.factor(weathersit)Weather_3 -1.243e+03 1.438e+02 -8.645 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 457.7 on 354 degrees of freedom
## Multiple R-squared: 0.8928, Adjusted R-squared: 0.8898
## F-statistic: 295 on 10 and 354 DF, p-value: < 2.2e-16
avPlots(stepwise_model1)
avPlots(stepwise_model2)
avPlots(stepwise_model3)
vif(stepwise_model1)
## GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.019139 1 1.009524
## as.factor(season) 3.632998 3 1.239874
## temp 3.278830 1 1.810754
## hum 1.900579 1 1.378615
## windspeed 1.199210 1 1.095085
## as.factor(weathersit) 1.819527 2 1.161421
vif(stepwise_model2)
## GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 8.827923 1 2.971182
## as.factor(season) 3.659745 3 1.241391
## as.factor(weekday) 9.484996 6 1.206201
## temp 3.291928 1 1.814367
## hum 1.942591 1 1.393769
## windspeed 1.204556 1 1.097523
## as.factor(weathersit) 1.891041 2 1.172668
vif(stepwise_model3)
## GVIF Df GVIF^(1/(2*Df))
## casual 3.575021 1 1.890773
## as.factor(workingday) 2.104307 1 1.450623
## as.factor(season) 3.879645 3 1.253522
## temp 4.203539 1 2.050253
## hum 1.922350 1 1.386488
## windspeed 1.248688 1 1.117447
## as.factor(weathersit) 1.881977 2 1.171261
The above vif is the result after doing following changes: Because of high multicollinearity of workingday and weekday in model2, we will remove one of them from model2. We also remove sqrt from sqrt(casual) in model 3, which changes back to casual, the multicollinearity reduces because GVIF of casual goes down from 4.88 to 3.57, but the R-squared for model3 reduces from 90% to 88%, which is good because effect of overfitting reduces.
#kaylee trial 2
full_model1 <- lm(registered ~ as.factor(workingday) + as.factor(season)+ as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit),
data = training_data)
full_model2 <- lm( sqrt(casual)~ as.factor(workingday) + as.factor(season)+ temp + hum + windspeed + as.factor(weathersit),
data = training_data)
full_model3 <- lm( cnt ~ casual+as.factor(workingday) + as.factor(season)+ as.factor(holiday) + temp + hum + windspeed + as.factor(weathersit),
data = training_data)
summary(full_model1)
##
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) +
## as.factor(holiday) + as.factor(weekday) + temp + hum + windspeed +
## as.factor(weathersit), data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1779.75 -237.37 62.58 308.19 1085.42
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 930.21 184.49 5.042 7.42e-07 ***
## as.factor(workingday)Workingday 746.78 93.56 7.982 2.10e-14 ***
## as.factor(season)Spring 816.26 93.31 8.747 < 2e-16 ***
## as.factor(season)Summer 799.01 122.44 6.526 2.39e-10 ***
## as.factor(season)Fall 1309.72 82.70 15.838 < 2e-16 ***
## as.factor(holiday)1 -114.41 178.98 -0.639 0.523072
## as.factor(weekday)1 -49.07 94.39 -0.520 0.603455
## as.factor(weekday)2 58.64 93.37 0.628 0.530438
## as.factor(weekday)3 41.91 94.55 0.443 0.657891
## as.factor(weekday)4 49.53 93.32 0.531 0.595893
## as.factor(weekday)5 NA NA NA NA
## as.factor(weekday)6 136.82 92.73 1.476 0.140967
## temp 2744.64 237.11 11.575 < 2e-16 ***
## hum -610.19 232.17 -2.628 0.008963 **
## windspeed -1480.21 353.67 -4.185 3.61e-05 ***
## as.factor(weathersit)Weather_2 -227.82 66.36 -3.433 0.000669 ***
## as.factor(weathersit)Weather_3 -1401.20 148.05 -9.465 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 472.7 on 349 degrees of freedom
## Multiple R-squared: 0.8094, Adjusted R-squared: 0.8012
## F-statistic: 98.77 on 15 and 349 DF, p-value: < 2.2e-16
summary(full_model2)
##
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) +
## temp + hum + windspeed + as.factor(weathersit), data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.5673 -2.9959 -0.0408 2.7551 15.8412
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.9546 1.7664 11.297 < 2e-16 ***
## as.factor(workingday)Workingday -11.1234 0.5437 -20.459 < 2e-16 ***
## as.factor(season)Spring 6.3291 0.9419 6.719 7.29e-11 ***
## as.factor(season)Summer 3.4574 1.2358 2.798 0.00543 **
## as.factor(season)Fall 4.4509 0.8344 5.334 1.71e-07 ***
## temp 31.9733 2.3927 13.363 < 2e-16 ***
## hum -6.0419 2.3220 -2.602 0.00966 **
## windspeed -14.0995 3.5681 -3.951 9.37e-05 ***
## as.factor(weathersit)Weather_2 -2.1217 0.6656 -3.188 0.00156 **
## as.factor(weathersit)Weather_3 -8.7984 1.4791 -5.948 6.49e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.78 on 355 degrees of freedom
## Multiple R-squared: 0.8, Adjusted R-squared: 0.7949
## F-statistic: 157.8 on 9 and 355 DF, p-value: < 2.2e-16
summary(full_model3)
##
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) +
## as.factor(holiday) + temp + hum + windspeed + as.factor(weathersit),
## data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2056.46 -261.73 31.05 332.27 1012.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.420e+02 1.788e+02 4.150 4.18e-05 ***
## casual 1.397e+00 8.174e-02 17.091 < 2e-16 ***
## as.factor(workingday)Workingday 9.643e+02 7.678e+01 12.559 < 2e-16 ***
## as.factor(season)Spring 7.145e+02 9.257e+01 7.719 1.22e-13 ***
## as.factor(season)Summer 7.446e+02 1.189e+02 6.261 1.11e-09 ***
## as.factor(season)Fall 1.253e+03 8.075e+01 15.516 < 2e-16 ***
## as.factor(holiday)1 -1.606e+02 1.523e+02 -1.055 0.29216
## temp 2.152e+03 2.599e+02 8.282 2.54e-15 ***
## hum -5.233e+02 2.239e+02 -2.337 0.02001 *
## windspeed -1.119e+03 3.487e+02 -3.208 0.00146 **
## as.factor(weathersit)Weather_2 -1.805e+02 6.425e+01 -2.810 0.00523 **
## as.factor(weathersit)Weather_3 -1.243e+03 1.438e+02 -8.646 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 457.6 on 353 degrees of freedom
## Multiple R-squared: 0.8932, Adjusted R-squared: 0.8899
## F-statistic: 268.3 on 11 and 353 DF, p-value: < 2.2e-16
#plot added variables
avPlots(full_model1)
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
## Warning in lsfit(mod.mat[, -var], cbind(mod.mat[, var], response), wt = wt, :
## 'X' matrix was collinear
avPlots(full_model2)
avPlots(full_model3)
#kaylee
stepwise_model1 <- step(full_model1, direction = "both")
## Start: AIC=4511.35
## registered ~ as.factor(workingday) + as.factor(season) + as.factor(holiday) +
## as.factor(weekday) + temp + hum + windspeed + as.factor(weathersit)
##
##
## Step: AIC=4511.35
## registered ~ as.factor(workingday) + as.factor(season) + as.factor(weekday) +
## temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## - as.factor(weekday) 6 1278049 79267291 4505.3
## <none> 77989242 4511.3
## - hum 1 1543574 79532816 4516.5
## - windspeed 1 3914363 81903605 4527.2
## - as.factor(workingday) 1 6617374 84606616 4539.1
## - as.factor(weathersit) 2 20032722 98021964 4590.8
## - temp 1 29941742 107930985 4627.9
## - as.factor(season) 3 60480699 138469941 4714.9
##
## Step: AIC=4505.28
## registered ~ as.factor(workingday) + as.factor(season) + temp +
## hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## <none> 79267291 4505.3
## + as.factor(holiday) 1 416658 78850634 4505.4
## - hum 1 1666159 80933450 4510.9
## + as.factor(weekday) 6 1278049 77989242 4511.3
## - windspeed 1 3787445 83054736 4520.3
## - as.factor(weathersit) 2 19539168 98806460 4581.7
## - temp 1 29890751 109158042 4620.1
## - as.factor(workingday) 1 39726359 118993650 4651.6
## - as.factor(season) 3 60612796 139880087 4706.6
stepwise_model2 <- step(full_model2, direction = "both")
## Start: AIC=1151.88
## sqrt(casual) ~ as.factor(workingday) + as.factor(season) + temp +
## hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## <none> 8110.7 1151.9
## - hum 1 154.7 8265.4 1156.8
## - windspeed 1 356.7 8467.5 1165.6
## - as.factor(weathersit) 2 829.7 8940.4 1183.4
## - as.factor(season) 3 1450.8 9561.5 1206.0
## - temp 1 4079.6 12190.3 1298.6
## - as.factor(workingday) 1 9562.9 17673.6 1434.2
stepwise_model3 <- step(full_model3, direction = "both")
## Start: AIC=4483.75
## cnt ~ casual + as.factor(workingday) + as.factor(season) + as.factor(holiday) +
## temp + hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## - as.factor(holiday) 1 233037 74144922 4482.9
## <none> 73911885 4483.7
## - hum 1 1143264 75055150 4487.4
## - windspeed 1 2154800 76066685 4492.2
## - temp 1 14362298 88274183 4546.6
## - as.factor(weathersit) 2 15748723 89660608 4550.3
## - as.factor(workingday) 1 33025598 106937483 4616.6
## - as.factor(season) 3 54516542 128428427 4679.4
## - casual 1 61161580 135073465 4701.8
##
## Step: AIC=4482.9
## cnt ~ casual + as.factor(workingday) + as.factor(season) + temp +
## hum + windspeed + as.factor(weathersit)
##
## Df Sum of Sq RSS AIC
## <none> 74144922 4482.9
## + as.factor(holiday) 1 233037 73911885 4483.7
## - hum 1 1087039 75231961 4486.2
## - windspeed 1 2121892 76266814 4491.2
## - temp 1 14190777 88335699 4544.8
## - as.factor(weathersit) 2 15737193 89882115 4549.2
## - as.factor(workingday) 1 36139539 110284461 4625.8
## - as.factor(season) 3 54380050 128524972 4677.7
## - casual 1 62035815 136180737 4702.8
summary(stepwise_model1)
##
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) +
## temp + hum + windspeed + as.factor(weathersit), data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1794.6 -246.9 47.2 311.4 1056.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 985.06 174.62 5.641 3.46e-08 ***
## as.factor(workingday)Workingday 716.94 53.75 13.338 < 2e-16 ***
## as.factor(season)Spring 819.04 93.12 8.796 < 2e-16 ***
## as.factor(season)Summer 805.89 122.17 6.596 1.53e-10 ***
## as.factor(season)Fall 1310.16 82.49 15.883 < 2e-16 ***
## temp 2736.84 236.54 11.570 < 2e-16 ***
## hum -627.06 229.55 -2.732 0.006617 **
## windspeed -1452.78 352.74 -4.119 4.75e-05 ***
## as.factor(weathersit)Weather_2 -221.52 65.80 -3.367 0.000844 ***
## as.factor(weathersit)Weather_3 -1367.32 146.23 -9.351 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 472.5 on 355 degrees of freedom
## Multiple R-squared: 0.8062, Adjusted R-squared: 0.8013
## F-statistic: 164.1 on 9 and 355 DF, p-value: < 2.2e-16
summary(stepwise_model2)
##
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) +
## temp + hum + windspeed + as.factor(weathersit), data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.5673 -2.9959 -0.0408 2.7551 15.8412
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.9546 1.7664 11.297 < 2e-16 ***
## as.factor(workingday)Workingday -11.1234 0.5437 -20.459 < 2e-16 ***
## as.factor(season)Spring 6.3291 0.9419 6.719 7.29e-11 ***
## as.factor(season)Summer 3.4574 1.2358 2.798 0.00543 **
## as.factor(season)Fall 4.4509 0.8344 5.334 1.71e-07 ***
## temp 31.9733 2.3927 13.363 < 2e-16 ***
## hum -6.0419 2.3220 -2.602 0.00966 **
## windspeed -14.0995 3.5681 -3.951 9.37e-05 ***
## as.factor(weathersit)Weather_2 -2.1217 0.6656 -3.188 0.00156 **
## as.factor(weathersit)Weather_3 -8.7984 1.4791 -5.948 6.49e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.78 on 355 degrees of freedom
## Multiple R-squared: 0.8, Adjusted R-squared: 0.7949
## F-statistic: 157.8 on 9 and 355 DF, p-value: < 2.2e-16
summary(stepwise_model3)
##
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) +
## temp + hum + windspeed + as.factor(weathersit), data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2060.85 -257.74 44.82 332.76 1011.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.189e+02 1.775e+02 4.050 6.29e-05 ***
## casual 1.403e+00 8.153e-02 17.210 < 2e-16 ***
## as.factor(workingday)Workingday 9.826e+02 7.480e+01 13.136 < 2e-16 ***
## as.factor(season)Spring 7.157e+02 9.258e+01 7.730 1.12e-13 ***
## as.factor(season)Summer 7.474e+02 1.189e+02 6.285 9.65e-10 ***
## as.factor(season)Fall 1.252e+03 8.076e+01 15.501 < 2e-16 ***
## temp 2.135e+03 2.594e+02 8.231 3.60e-15 ***
## hum -5.094e+02 2.236e+02 -2.278 0.02331 *
## windspeed -1.110e+03 3.486e+02 -3.183 0.00159 **
## as.factor(weathersit)Weather_2 -1.840e+02 6.418e+01 -2.867 0.00439 **
## as.factor(weathersit)Weather_3 -1.243e+03 1.438e+02 -8.645 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 457.7 on 354 degrees of freedom
## Multiple R-squared: 0.8928, Adjusted R-squared: 0.8898
## F-statistic: 295 on 10 and 354 DF, p-value: < 2.2e-16
avPlots(stepwise_model1)
avPlots(stepwise_model2)
avPlots(stepwise_model3)
vif(stepwise_model1)
## GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.019139 1 1.009524
## as.factor(season) 3.632998 3 1.239874
## temp 3.278830 1 1.810754
## hum 1.900579 1 1.378615
## windspeed 1.199210 1 1.095085
## as.factor(weathersit) 1.819527 2 1.161421
vif(stepwise_model2)
## GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.019139 1 1.009524
## as.factor(season) 3.632998 3 1.239874
## temp 3.278830 1 1.810754
## hum 1.900579 1 1.378615
## windspeed 1.199210 1 1.095085
## as.factor(weathersit) 1.819527 2 1.161421
vif(stepwise_model3)
## GVIF Df GVIF^(1/(2*Df))
## casual 3.575021 1 1.890773
## as.factor(workingday) 2.104307 1 1.450623
## as.factor(season) 3.879645 3 1.253522
## temp 4.203539 1 2.050253
## hum 1.922350 1 1.386488
## windspeed 1.248688 1 1.117447
## as.factor(weathersit) 1.881977 2 1.171261
validation_data$predicted_registered <- predict(stepwise_model1, newdata = validation_data)
validation_data$predicted_sqrtcasual <- predict(stepwise_model2, newdata = validation_data)
validation_data$predicted_count <- predict(stepwise_model3, newdata = validation_data)
observed_values1 <- validation_data$registered
predicted_values1 <- validation_data$predicted_registered
# Calculate different prediction performance metrics
# Functions from the MLmetrics package
# Common regression metrics
# Calculate the Root Mean Squared Error (RMSE), which measures the ...
# ... average magnitude of prediction errors.
# Lower is better.
rmse1 <- RMSE(predicted_values1, observed_values1)
# same as
# sqrt(mean((observed_values - predicted_values)^2))
# Compute the Mean Absolute Error (MAE), indicating the average absolute ...
# ... difference between predicted and observed values.
# Lower is better.
mae1 <- MAE(predicted_values1, observed_values1)
# same as
# mean(abs(observed_values - predicted_values))
# Calculate the Mean Absolute Percentage Error (MAPE), measuring the ...
# ... average percentage difference between predicted and observed values.
mape1 <- MAPE(predicted_values1, observed_values1)
# same as
# mean(abs(predicted_values-observed_values)/observed_values)
# Determine the R-squared (R²) Score, representing the proportion of the ...
# ... variance in the observed values (of validation data set) ...
# ... explained by the predicted values from the model.
# Higher is better.
r_squared1 <- R2_Score(predicted_values1, observed_values1)
# same as
# summary(lm(observed_values ~ predicted_values))$r.squared
# Display the calculated metrics
cat("Root Mean Squared Error (RMSE):", round(rmse1, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 1931.118
cat("Mean Absolute Error (MAE):", round(mae1, digits = 4), "\n")
## Mean Absolute Error (MAE): 1799.516
cat("R-squared (R^2) Score:", round(r_squared1, digits = 4), "\n")
## R-squared (R^2) Score: -0.8432
cat("Mean Absolute Percentage Error (MPE):", round(mape1, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.6396
#second question
observed_values2 <- sqrt(validation_data$casual) #viki: should I transform it here?? kaylee: I think so
predicted_values2 <- validation_data$predicted_sqrtcasual
rmse2 <- RMSE(predicted_values2, observed_values2)
mae2 <- MAE(predicted_values2, observed_values2)
mape2 <- MAPE(predicted_values2, observed_values2)
r_squared2 <- R2_Score(predicted_values2, observed_values2)
cat("Root Mean Squared Error (RMSE):", round(rmse2, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 7.6403
cat("Mean Absolute Error (MAE):", round(mae2, digits = 4), "\n")
## Mean Absolute Error (MAE): 5.9599
cat("R-squared (R^2) Score:", round(r_squared2, digits = 4), "\n")
## R-squared (R^2) Score: 0.5814
cat("Mean Absolute Percentage Error (MPE):", round(mape2, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2125
#third question
observed_values3 <- validation_data$count
predicted_values3 <- validation_data$predicted_count
rmse3 <- RMSE(predicted_values3, observed_values3)
mae3 <- MAE(predicted_values3, observed_values3)
mape3 <- MAPE(predicted_values3, observed_values3)
r_squared3 <- R2_Score(predicted_values3, observed_values3)
## Warning in mean.default(y_true): argument is not numeric or logical: returning
## NA
cat("Root Mean Squared Error (RMSE):", round(rmse3, digits = 4), "\n")
## Root Mean Squared Error (RMSE): NaN
cat("Mean Absolute Error (MAE):", round(mae3, digits = 4), "\n")
## Mean Absolute Error (MAE): NaN
cat("R-squared (R^2) Score:", round(r_squared3, digits = 4), "\n")
## R-squared (R^2) Score: NaN
cat("Mean Absolute Percentage Error (MPE):", round(mape3, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): NaN
we choose model 2 because it has best RMSE and R^2 (it is done for whole 2011 year)
m.mlr <- lm( sqrt(casual)~ as.factor(workingday) + as.factor(season)+ temp + hum + windspeed + as.factor(weathersit),
data = training_data)
diagnostics_df <- data.frame(
Residuals = resid(m.mlr),
Fitted_Values = fitted(m.mlr),
Standardized_Residuals = rstandard(m.mlr),
Leverage = hatvalues(m.mlr)
#Date = year2011$dteday
)
# Create the standardized residuals vs. fitted values plot
ggplot(diagnostics_df, aes(x = Fitted_Values, y = Residuals)) +
geom_point(col="blue", alpha=0.75) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs( title = "Residuals vs. Fitted Values",
x = "Fitted Values", y = "Residuals") +
theme_bw()
# Create the QQ plot
ggplot(diagnostics_df, aes(sample = Standardized_Residuals)) +
stat_qq(aes(sample = Standardized_Residuals), distribution = qnorm,
size = 2, col="blue", alpha = 0.75) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
labs(title = "Standardized Residual QQ Plot",
x = "Theoretical Quantiles", y = "Sample Quantiles") +
theme_bw()
# Create the sqrt(|standardized residuals|) vs. fitted values plot
ggplot(diagnostics_df, aes(x = Fitted_Values, y = sqrt(abs(Standardized_Residuals)))) +
geom_point(col="blue", alpha=0.75) +
labs( title = "Residuals vs. Fitted Values",
x = "Fitted Values", y = "sqrt(|Standardized Residuals|)") +
theme_bw()
# Leverage vs Standardized Residuals
ggplot(diagnostics_df, aes(x = Leverage, y = Standardized_Residuals)) +
geom_point(alpha = 0.75) +
labs(title = "Standardized Residuals vs. Leverage Plot",
x = "Leverage", y = "Standardized Residuals") +
theme_bw()
validation2012<- subset(validation_data)
validation2012$predicted_casual <- predict(m.mlr, newdata = validation2012)
summary(validation2012$predicted_casual)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.625 18.851 25.486 24.806 29.947 45.683
# Extract observed and predicted values
observed_values <- sqrt(validation2012$casual)
predicted_values <- validation2012$predicted_casual
rmse <- RMSE(predicted_values, observed_values)
mae <- MAE(predicted_values, observed_values)
mape <- MAPE(predicted_values, observed_values)
r_squared <- R2_Score(predicted_values, observed_values)
cat("Root Mean Squared Error (RMSE):", round(rmse, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 7.6403
cat("Mean Absolute Error (MAE):", round(mae, digits = 4), "\n")
## Mean Absolute Error (MAE): 5.9599
cat("R-squared (R^2) Score:", round(r_squared, digits = 4), "\n")
## R-squared (R^2) Score: 0.5814
cat("Mean Absolute Percentage Error (MPE):", round(mape, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2125
ggplot(validation2012, aes(x = observed_values, y = predicted_values)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
labs(x = "Observed Values", y = "Predicted Values",
title = "Observed vs. Predicted Values") +
theme_bw()
# Residuals plot
ggplot(validation2012, aes(x = 1:nrow(validation2012), y = observed_values-predicted_values)) +
geom_point() +
geom_abline(intercept = 0, slope = 0, color = "red", linetype = "dashed") +
labs(x = "Observation Index", y = "Residuals",
title = "Observed vs. Predicted Values") +
theme_bw()
head(training_data$dteday)
## [1] "2011-01-01" "2011-01-02" "2011-01-03" "2011-01-04" "2011-01-05"
## [6] "2011-01-06"
# Create a new data frame 'tempdataset' by selecting specific columns from 'training_data'
# For 'Date' column: training_data$dteday contains the date in "dd/mm/yyyy" format.
# We use the as.Date function and specify the format as "%d/%m/%Y" to match ...
tempdataset <- data.frame(Date = as.Date(training_data$dteday, format = "%d/%m/%Y"),
CasualUsers = training_data$casual,
WorkingDay = training_data$workingday,
Season = training_data$season,
Temperature = training_data$temp,
Humidity = training_data$hum,
WindSpeed = training_data$windspeed,
Weathersit = training_data$weathersit)
# Ensure there are no missing values
Dataset <- tempdataset[complete.cases(tempdataset), ]
# Remove the 'tempdataset' data frame from the R environment
rm(tempdataset)
# Add a square root transformation of the CasualUsers column
Dataset$SqrtCasualUsers <- sqrt(Dataset$CasualUsers)
# Fit the model
m.mlr <- lm(SqrtCasualUsers ~ as.factor(WorkingDay) + as.factor(Season) +
Temperature + Humidity + WindSpeed + as.factor(Weathersit),
data = Dataset)
# Check the model's summary
summary(m.mlr)
##
## Call:
## lm(formula = SqrtCasualUsers ~ as.factor(WorkingDay) + as.factor(Season) +
## Temperature + Humidity + WindSpeed + as.factor(Weathersit),
## data = Dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.5673 -2.9959 -0.0408 2.7551 15.8412
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.9546 1.7664 11.297 < 2e-16 ***
## as.factor(WorkingDay)Workingday -11.1234 0.5437 -20.459 < 2e-16 ***
## as.factor(Season)Spring 6.3291 0.9419 6.719 7.29e-11 ***
## as.factor(Season)Summer 3.4574 1.2358 2.798 0.00543 **
## as.factor(Season)Fall 4.4509 0.8344 5.334 1.71e-07 ***
## Temperature 31.9733 2.3927 13.363 < 2e-16 ***
## Humidity -6.0419 2.3220 -2.602 0.00966 **
## WindSpeed -14.0995 3.5681 -3.951 9.37e-05 ***
## as.factor(Weathersit)Weather_2 -2.1217 0.6656 -3.188 0.00156 **
## as.factor(Weathersit)Weather_3 -8.7984 1.4791 -5.948 6.49e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.78 on 355 degrees of freedom
## Multiple R-squared: 0.8, Adjusted R-squared: 0.7949
## F-statistic: 157.8 on 9 and 355 DF, p-value: < 2.2e-16
library(ggplot2)
# Time series modeling
# Create a variable indicating observations ordered by time
Dataset$Time_Seq <- order(Dataset$Date)
# Explore the correlation between GroundCO and Time
ggplot(Dataset, aes(x = Time_Seq, y = SqrtCasualUsers)) +
geom_point(color = 'blue', alpha = 0.75) +
geom_abline(intercept = 0, slope = 0, color = "red", linetype = "dashed") +
labs(x = "Time", y = "SqrtCasualUsers", title = "Time vs. SqrtCasualUsers") +
theme_bw()
Based on the plot of the relationship between the transformed response variable - the square root of casual users - and the Time, such transformation helps in stabilizing the variance and makes the data more normally distributed.
# Perform Multiple Linear Regression with Time_Seq as a predictor
m.mlr.time <- lm(SqrtCasualUsers ~ as.factor(WorkingDay) + as.factor(Season) +
Temperature + Humidity + WindSpeed + as.factor(Weathersit) + Time_Seq, data = Dataset)
# Examine the model fit for m.mlr.time
summary(m.mlr.time)
##
## Call:
## lm(formula = SqrtCasualUsers ~ as.factor(WorkingDay) + as.factor(Season) +
## Temperature + Humidity + WindSpeed + as.factor(Weathersit) +
## Time_Seq, data = Dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.5234 -2.9870 -0.0344 2.7588 15.7924
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.020403 1.775593 11.275 < 2e-16 ***
## as.factor(WorkingDay)Workingday -11.125996 0.544372 -20.438 < 2e-16 ***
## as.factor(Season)Spring 6.373544 0.949130 6.715 7.50e-11 ***
## as.factor(Season)Summer 3.647400 1.319751 2.764 0.00601 **
## as.factor(Season)Fall 4.839480 1.257157 3.850 0.00014 ***
## Temperature 32.098287 2.414520 13.294 < 2e-16 ***
## Humidity -5.926959 2.341291 -2.531 0.01179 *
## WindSpeed -14.141652 3.573771 -3.957 9.17e-05 ***
## as.factor(Weathersit)Weather_2 -2.150941 0.670078 -3.210 0.00145 **
## as.factor(Weathersit)Weather_3 -8.820967 1.481868 -5.953 6.35e-09 ***
## Time_Seq -0.001829 0.004422 -0.414 0.67937
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.785 on 354 degrees of freedom
## Multiple R-squared: 0.8001, Adjusted R-squared: 0.7944
## F-statistic: 141.7 on 10 and 354 DF, p-value: < 2.2e-16
#get the AIC BIC R^2
# Summary of the model
summary_mmlr_time <- summary(m.mlr.time)
# Extract AIC and BIC
aic_mmlr_time <- AIC(m.mlr.time)
bic_mmlr_time <- BIC(m.mlr.time)
# Extract R-squared and Adjusted R-squared
rsquared_mmlr_time <- summary_mmlr_time$r.squared
adjrsquared_mmlr_time <- summary_mmlr_time$adj.r.squared
# Print the AIC, BIC, R-squared and Adjusted R-squared
cat("AIC:", aic_mmlr_time, "\n")
## AIC: 2191.531
cat("BIC:", bic_mmlr_time, "\n")
## BIC: 2238.329
cat("R-squared:", rsquared_mmlr_time, "\n")
## R-squared: 0.8000885
cat("Adjusted R-squared:", adjrsquared_mmlr_time, "\n")
## Adjusted R-squared: 0.7944412
# Time Series: Autocorrelation function
# We shall now study the correlation of the response with respect to time
# - Dataset$SqrtCasualUsers: Time series data for which autocorrelation is calculated
# - lag.max: Maximum number of lags to compute autocorrelation (default is often set to 10)
# - plot: Boolean, whether to plot the autocorrelation function (TRUE or FALSE)
acf(Dataset$SqrtCasualUsers, lag.max = 100, plot = TRUE)
acf(residuals(m.mlr.time)[Dataset$Time_Seq], lag.max = 100, plot = TRUE)
In summary, this ACF plot suggests that there is some autocorrelation
in the early lags of the SqrtCasualUsers
time series, but it quickly diminishes, implying that recent past values
(especially the immediate last value) are somewhat informative in
predicting the current value, but this influence diminishes as you go
further back in time. We will consider GLS with AR(1) because we have
multiple predictors, and there may be autocorrelation in the residuals
of a multiple regression model, we will use a GLS model with AR(1)
correlation.
#install.packages("nlme")
library(nlme)
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
# Run Generalized Least Squares (GLS) with corAR1 model (Auto-Regressive 1)
# Note: This might take some time, depending on your computer and memory
m.gls <- gls(SqrtCasualUsers ~ as.factor(WorkingDay) + as.factor(Season) + Temperature + Humidity + WindSpeed + as.factor(Weathersit),
data = Dataset,
correlation = corAR1(form = ~ Time_Seq),
method = "ML") # Maximum Likelihood estimation
# Display summary information about the GLS model
summary(m.gls)
## Generalized least squares fit by maximum likelihood
## Model: SqrtCasualUsers ~ as.factor(WorkingDay) + as.factor(Season) + Temperature + Humidity + WindSpeed + as.factor(Weathersit)
## Data: Dataset
## AIC BIC logLik
## 2106.345 2153.144 -1041.173
##
## Correlation Structure: AR(1)
## Formula: ~Time_Seq
## Parameter estimate(s):
## Phi
## 0.4700105
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 21.184330 1.837644 11.527985 0.0000
## as.factor(WorkingDay)Workingday -10.084272 0.502708 -20.059904 0.0000
## as.factor(Season)Spring 6.645981 1.406163 4.726325 0.0000
## as.factor(Season)Summer 4.382162 1.758259 2.492330 0.0131
## as.factor(Season)Fall 4.503845 1.271533 3.542058 0.0004
## Temperature 29.864538 3.206705 9.313155 0.0000
## Humidity -7.567471 2.174009 -3.480884 0.0006
## WindSpeed -15.323345 3.139098 -4.881449 0.0000
## as.factor(Weathersit)Weather_2 -2.260544 0.570305 -3.963743 0.0001
## as.factor(Weathersit)Weather_3 -8.500411 1.304037 -6.518536 0.0000
##
## Correlation:
## (Intr) a.(WD) as.fctr(Ssn)Sp as.fctr(Ssn)Sm
## as.factor(WorkingDay)Workingday -0.163
## as.factor(Season)Spring 0.029 0.003
## as.factor(Season)Summer 0.111 0.019 0.702
## as.factor(Season)Fall -0.096 0.008 0.583 0.589
## Temperature -0.388 -0.050 -0.563 -0.747
## Humidity -0.650 0.014 -0.032 0.014
## WindSpeed -0.461 0.029 0.039 0.099
## as.factor(Weathersit)Weather_2 0.306 -0.059 -0.016 -0.009
## as.factor(Weathersit)Weather_3 0.284 -0.058 -0.014 -0.041
## a.(S)F Tmprtr Humdty WndSpd a.(W)W_2
## as.factor(WorkingDay)Workingday
## as.factor(Season)Spring
## as.factor(Season)Summer
## as.factor(Season)Fall
## Temperature -0.360
## Humidity -0.086 -0.114
## WindSpeed 0.122 -0.072 0.215
## as.factor(Weathersit)Weather_2 0.026 0.080 -0.552 -0.184
## as.factor(Weathersit)Weather_3 -0.043 0.085 -0.432 -0.218 0.437
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -3.45133400 -0.64465501 -0.01827247 0.62272981 3.45941760
##
## Residual standard error: 4.749606
## Degrees of freedom: 365 total; 355 residual
# Diagnostics and plots for the time series model
# Autocorrelation plot of residuals
acf(residuals(m.gls))
# Calculate predictions
predictions <- fitted(m.gls)
# Calculate residuals
residuals <- Dataset$SqrtCasualUsers - predictions
# Calculate RMSE
rmse <- sqrt(mean(residuals^2))
# Extract log-likelihood to compute AIC and BIC
loglik <- logLik(m.gls)
# Calculate AIC
aic <- AIC(m.gls)
# Calculate BIC
bic <- BIC(m.gls)
# Output the results
list(RMSE = rmse, AIC = aic, BIC = bic)
## $RMSE
## [1] 4.752788
##
## $AIC
## [1] 2106.345
##
## $BIC
## [1] 2153.144
######For the R-squared and Adjusted R-squared
# Total Sum of Squares
SST <- sum((Dataset$SqrtCasualUsers - mean(Dataset$SqrtCasualUsers))^2)
# Residual Sum of Squares
RSS <- sum(residuals^2)
# R-squared
r_squared <- 1 - (RSS/SST)
# Adjusted R-squared
n <- nrow(Dataset) # Number of observations
p <- length(coef(m.gls)) # Number of predictors including intercept
adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))
# Output R-squared and Adjusted R-squared
list(R_squared = r_squared, Adjusted_R_squared = adj_r_squared)
## $R_squared
## [1] 0.7966812
##
## $Adjusted_R_squared
## [1] 0.7909377
library(nlme)
#Extract Residuals and Fitted Values:
fitted_vals <- fitted(m.gls)
residuals_vals <- residuals(m.gls)
plot(fitted_vals, residuals_vals,
xlab = "Fitted Values",
ylab = "Residuals",
main = "Residuals vs Fitted")
abline(h = 0, col = "red", lty = 2)
#QQ plot for standardized residual plot
qqnorm(residuals_vals, main = "Q-Q Residuals")
qqline(residuals_vals, col = "red", lty = 2)
#Scale-Location Plot (Spread vs Level):
# First, ensure there are no missing values in the residuals or the fitted values
resid <- residuals(m.gls, type = "pearson") # Pearson residuals are standardized
fitted_vals <- fitted(m.gls)
# Since we're plotting sqrt of abs(residuals), ensure no NAs are produced in the process
# Calculate the square root of the absolute standardized residuals
sqrt_abs_std_resid <- sqrt(abs(resid))
# Now create the Scale-Location plot
plot(fitted_vals, sqrt_abs_std_resid,
xlab = "Fitted Values",
ylab = "Sqrt of Absolute Standardized Residuals",
main = "Scale-Location")
abline(h = 0, col = "red", lty = 2)
Vtempdataset <- data.frame(Date = as.Date(validation_data$dteday, format = "%d/%m/%Y"),
CasualUsers = validation_data$casual,
WorkingDay = validation_data$workingday,
Season = validation_data$season,
Temperature = validation_data$temp,
Humidity = validation_data$hum,
WindSpeed = validation_data$windspeed,
Weathersit = validation_data$weathersit)
# Ensure there are no missing values
VDataset <- Vtempdataset[complete.cases(Vtempdataset), ]
# Remove the 'tempdataset' data frame from the R environment
rm(Vtempdataset)
# Add a square root transformation of the CasualUsers column
VDataset$SqrtCasualUsers <- sqrt(VDataset$CasualUsers)
library(ggplot2)
# Time series modeling
# Create a variable indicating observations ordered by time
VDataset$Time_Seq <- order(VDataset$Date)
# Explore the correlation between GroundCO and Time
ggplot(VDataset, aes(x = Time_Seq, y = SqrtCasualUsers)) +
geom_point(color = 'blue', alpha = 0.75) +
geom_abline(intercept = 0, slope = 0, color = "red", linetype = "dashed") +
labs(x = "Time", y = "SqrtCasualUsers", title = "Time vs. SqrtCasualUsers") +
theme_bw()
# Predict using the validation dataset
VDataset$PredictedSqrtCasualUsers <- predict(m.gls, newdata = VDataset)
# Plot of fitted values against observed square root values from the validation dataset
plot(VDataset$PredictedSqrtCasualUsers, VDataset$SqrtCasualUsers,
xlab = "Predicted Fitted Values", ylab = "Observed Sqrt Casual Users",
main = "Predicted vs. Observed Values")
abline(a = 0, b = 1, col = "red", lty = 2)
# For plotting residuals over time, you need to calculate residuals for the validation dataset.
# Residuals are the differences between the observed and predicted values.
VDataset$Residuals <- VDataset$SqrtCasualUsers - VDataset$PredictedSqrtCasualUsers
# Plot of residuals over time
# Assuming you have a time sequence variable in your validation dataset named Time_Seq
plot(VDataset$Time_Seq, VDataset$Residuals,
xlab = "Time", ylab = "Residuals",
main = "Residuals over Time")
model1 <- lm(registered ~ as.factor(workingday)+ as.factor(weathersit) + temp + windspeed,
data = training_data)
summary(model1) #r^2 = 0.6532
##
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(weathersit) +
## temp + windspeed, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1804.28 -451.16 -70.39 515.08 1394.11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1065.70 137.48 7.752 9.41e-14 ***
## as.factor(workingday)Workingday 704.89 70.89 9.943 < 2e-16 ***
## as.factor(weathersit)Weather_2 -292.24 70.28 -4.158 4.01e-05 ***
## as.factor(weathersit)Weather_3 -1207.67 168.55 -7.165 4.44e-12 ***
## temp 3606.33 174.57 20.659 < 2e-16 ***
## windspeed -2227.25 431.26 -5.165 4.00e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 624.3 on 359 degrees of freedom
## Multiple R-squared: 0.658, Adjusted R-squared: 0.6532
## F-statistic: 138.1 on 5 and 359 DF, p-value: < 2.2e-16
#model2
model2 <- lm(sqrt(casual)~as.factor(season) + as.factor(weathersit) + as.factor(workingday),data = training_data)
summary(model2) #r^2 = 0.6873
##
## Call:
## lm(formula = sqrt(casual) ~ as.factor(season) + as.factor(weathersit) +
## as.factor(workingday), data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.9963 -3.5717 0.0082 3.6853 19.3328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.5236 0.7917 28.449 < 2e-16 ***
## as.factor(season)Spring 14.3744 0.8760 16.409 < 2e-16 ***
## as.factor(season)Summer 17.1367 0.8728 19.633 < 2e-16 ***
## as.factor(season)Fall 9.4982 0.8887 10.688 < 2e-16 ***
## as.factor(weathersit)Weather_2 -3.6307 0.6656 -5.455 9.18e-08 ***
## as.factor(weathersit)Weather_3 -12.5801 1.5961 -7.882 3.93e-14 ***
## as.factor(workingday)Workingday -10.6997 0.6697 -15.977 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.902 on 358 degrees of freedom
## Multiple R-squared: 0.6925, Adjusted R-squared: 0.6873
## F-statistic: 134.4 on 6 and 358 DF, p-value: < 2.2e-16
#model3
model3 <- lm(cnt ~ temp + windspeed + hum + as.factor(workingday),data = training_data)
summary(model3) #r^2 = 0.6511
##
## Call:
## lm(formula = cnt ~ temp + windspeed + hum + as.factor(workingday),
## data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2891.63 -500.88 -34.64 651.15 1940.44
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2413.11 264.78 9.114 < 2e-16 ***
## temp 5592.41 228.62 24.461 < 2e-16 ***
## windspeed -4021.07 570.77 -7.045 9.48e-12 ***
## hum -1467.65 296.35 -4.952 1.13e-06 ***
## as.factor(workingday)Workingday -21.41 91.92 -0.233 0.816
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 814.4 on 360 degrees of freedom
## Multiple R-squared: 0.655, Adjusted R-squared: 0.6511
## F-statistic: 170.8 on 4 and 360 DF, p-value: < 2.2e-16
#Model Diagnostics: to assess assumptions and multicollinearity
#!!!Model1!!!
# Plotting added variable plots
avPlots(model1)
# Check for multicollinearity: computing variance inflation factors (VIFs)
vif(model1)
## GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.015839 1 1.007888
## as.factor(weathersit) 1.035705 2 1.008809
## temp 1.023139 1 1.011503
## windspeed 1.027020 1 1.013420
# Create a data frame with the residuals and fitted values
diagnostics_df1 <- data.frame(Residuals1 = resid(model1),
Fitted_Values1 = fitted(model1),
Standardized_Residuals1 = rstandard(model1),
Leverage1 = hatvalues(model1),
Season = training_data$season,
Workingday = training_data$workingday,
Weather = training_data$weathersit)
#!!!Model2!!!
avPlots(model2)
vif(model2)
## GVIF Df GVIF^(1/(2*Df))
## as.factor(season) 1.034514 3 1.005671
## as.factor(weathersit) 1.046385 2 1.011400
## as.factor(workingday) 1.014109 1 1.007030
diagnostics_df2 <- data.frame(Residuals2 = resid(model2),
Fitted_Values2 = fitted(model2),
Standardized_Residuals2 = rstandard(model2),
Leverage2 = hatvalues(model2),
Season = training_data$season,
Workingday = training_data$workingday,
Weather = training_data$weathersit)
#!!!Model3!!!
avPlots(model3)
vif(model3)
## temp windspeed hum
## 1.031276 1.057145 1.066509
## as.factor(workingday)
## 1.003524
diagnostics_df3 <- data.frame(Residuals3 = resid(model3),
Fitted_Values3 = fitted(model3),
Standardized_Residuals3 = rstandard(model3),
Leverage3 = hatvalues(model3),
Season = training_data$season,
Workingday = training_data$workingday,
Weather = training_data$weathersit)
plot the diagnostics
#!!!Model1!!!
# Create the standardized residuals vs. fitted values plot
ggplot(diagnostics_df1, aes(x = Fitted_Values1, y = Residuals1)) +
geom_point(col="blue", alpha=0.75) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs( title = "Residuals vs. Fitted Values",
x = "Fitted Values", y = "Residuals") +
theme_bw()
# Create the QQ plot
ggplot(diagnostics_df1, aes(sample = Standardized_Residuals1)) +
stat_qq(aes(sample = Standardized_Residuals1), distribution = qnorm,
size = 2, col="blue", alpha = 0.75) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
labs(title = "Standardized Residual QQ Plot",
x = "Theoretical Quantiles", y = "Sample Quantiles") +
theme_bw()
# Create the sqrt(|standardized residuals|) vs. fitted values plot
ggplot(diagnostics_df1, aes(x = Fitted_Values1, y = sqrt(abs(Standardized_Residuals1)))) +
geom_point(col="blue", alpha=0.75) +
labs( title = "Residuals vs. Fitted Values",
x = "Fitted Values", y = "sqrt(|Standardized Residuals|)") +
theme_bw()
# Leverage vs Standardized Residuals
ggplot(diagnostics_df1, aes(x = Leverage1, y = Standardized_Residuals1)) +
geom_point(alpha = 0.75) +
labs(title = "Standardized Residuals vs. Leverage Plot",
x = "Leverage", y = "Standardized Residuals") +
theme_bw()
#!!!Model2!!!
#residual vs fitted values
ggplot(diagnostics_df2, aes(x = Fitted_Values2, y = Residuals2)) +
geom_point(col="blue", alpha=0.75) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs( title = "Residuals vs. Fitted Values",
x = "Fitted Values", y = "Residuals") +
theme_bw()
#QQ plot
ggplot(diagnostics_df2, aes(sample = Standardized_Residuals2)) +
stat_qq(aes(sample = Standardized_Residuals2), distribution = qnorm,
size = 2, col="blue", alpha = 0.75) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
labs(title = "Standardized Residual QQ Plot",
x = "Theoretical Quantiles", y = "Sample Quantiles") +
theme_bw()
#sqrt(|standardized residuals|) vs. fitted values plot
ggplot(diagnostics_df2, aes(x = Fitted_Values2, y = sqrt(abs(Standardized_Residuals2)))) +
geom_point(col="blue", alpha=0.75) +
labs( title = "Residuals vs. Fitted Values",
x = "Fitted Values", y = "sqrt(|Standardized Residuals|)") +
theme_bw()
# Leverage vs Standardized Residuals
ggplot(diagnostics_df2, aes(x = Leverage2, y = Standardized_Residuals2)) +
geom_point(alpha = 0.75) +
labs(title = "Standardized Residuals vs. Leverage Plot",
x = "Leverage", y = "Standardized Residuals") +
theme_bw()
#!!!Model3!!!
#residual vs fitted values
ggplot(diagnostics_df3, aes(x = Fitted_Values3, y = Residuals3)) +
geom_point(col="blue", alpha=0.75) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs( title = "Residuals vs. Fitted Values",
x = "Fitted Values", y = "Residuals") +
theme_bw()
#QQ plot
ggplot(diagnostics_df3, aes(sample = Standardized_Residuals3)) +
stat_qq(aes(sample = Standardized_Residuals3), distribution = qnorm,
size = 2, col="blue", alpha = 0.75) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
labs(title = "Standardized Residual QQ Plot",
x = "Theoretical Quantiles", y = "Sample Quantiles") +
theme_bw()
#sqrt(|standardized residuals|) vs. fitted values plot
ggplot(diagnostics_df3, aes(x = Fitted_Values3, y = sqrt(abs(Standardized_Residuals3)))) +
geom_point(col="blue", alpha=0.75) +
labs( title = "Residuals vs. Fitted Values",
x = "Fitted Values", y = "sqrt(|Standardized Residuals|)") +
theme_bw()
# Leverage vs Standardized Residuals
ggplot(diagnostics_df3, aes(x = Leverage3, y = Standardized_Residuals3)) +
geom_point(alpha = 0.75) +
labs(title = "Standardized Residuals vs. Leverage Plot",
x = "Leverage", y = "Standardized Residuals") +
theme_bw()
#Prediction: training and validation
#Model1 - training
#model1
training_data$Predicted_registered <- predict(model1, training_data)
# Extract observed and predicted values
observed_values1 <- training_data$registered
predicted_values1 <- training_data$Predicted_registered
# Calculate different prediction performance metrics
#rmse
rmse1 <- RMSE(predicted_values1, observed_values1)
#mae
mae1 <- MAE(predicted_values1, observed_values1)
#mape
mape1 <- MAPE(predicted_values1, observed_values1)
#r^2
r_squared1 <- R2_Score(predicted_values1, observed_values1)
#display
cat("Root Mean Squared Error (RMSE):", round(rmse1, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 619.1167
cat("Mean Absolute Error (MAE):", round(mae1, digits = 4), "\n")
## Mean Absolute Error (MAE): 519.1348
cat("R-squared (R^2) Score:", round(r_squared1, digits = 4), "\n")
## R-squared (R^2) Score: 0.658
cat("Mean Absolute Percentage Error (MPE):", round(mape1, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2562
# Root Mean Squared Error (RMSE): 619.1167
# Mean Absolute Error (MAE): 519.1348
# R-squared (R^2) Score: 0.658
# Mean Absolute Percentage Error (MPE): 0.2562
#Model1 - validation
validation_data$Predicted_registered <- predict(model1, validation_data)
# Extract observed and predicted values
observed_values1 <- validation_data$registered
predicted_values1 <- validation_data$Predicted_registered
# Calculate different prediction performance metrics
#rmse
rmse1 <- RMSE(predicted_values1, observed_values1)
#mae
mae1 <- MAE(predicted_values1, observed_values1)
#mape
mape1 <- MAPE(predicted_values1, observed_values1)
#r^2
r_squared1 <- R2_Score(predicted_values1, observed_values1)
#display
cat("Root Mean Squared Error (RMSE):", round(rmse1, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 1993.671
cat("Mean Absolute Error (MAE):", round(mae1, digits = 4), "\n")
## Mean Absolute Error (MAE): 1805.783
cat("R-squared (R^2) Score:", round(r_squared1, digits = 4), "\n")
## R-squared (R^2) Score: -0.9646
cat("Mean Absolute Percentage Error (MPE):", round(mape1, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.5715
# Root Mean Squared Error (RMSE): 1993.671
# Mean Absolute Error (MAE): 1805.783
# R-squared (R^2) Score: -0.9646
# Mean Absolute Percentage Error (MPE): 0.5715
#Model2 - training
# Extract observed and predicted values
sqrt_casual <- sqrt(training_data$casual)
observed_values2 <- sqrt_casual
training_data$Predicted_sqrt_casual <- predict(model2, training_data)
predicted_values2 <- training_data$Predicted_sqrt_casual
#rmse
rmse2 <- RMSE(predicted_values2, observed_values2)
#mae
mae2 <- MAE(predicted_values2, observed_values2)
#mape
mape2 <- MAPE(predicted_values2, observed_values2)
#r^2
r_squared2 <- R2_Score(predicted_values2, observed_values2)
#display
cat("Root Mean Squared Error (RMSE):", round(rmse2, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 5.8451
cat("Mean Absolute Error (MAE):", round(mae2, digits = 4), "\n")
## Mean Absolute Error (MAE): 4.5223
cat("R-squared (R^2) Score:", round(r_squared2, digits = 4), "\n")
## R-squared (R^2) Score: 0.6925
cat("Mean Absolute Percentage Error (MPE):", round(mape2, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2555
# Root Mean Squared Error (RMSE): 5.8451
# Mean Absolute Error (MAE): 4.5223
# R-squared (R^2) Score: 0.6925
# Mean Absolute Percentage Error (MPE): 0.2555
#Model2 - validation
# Extract observed and predicted values
sqrt_casual <- sqrt(validation_data$casual)
observed_values2 <- sqrt_casual
validation_data$Predicted_sqrt_casual <- predict(model2, validation_data)
predicted_values2 <- validation_data$Predicted_sqrt_casual
#rmse
rmse2 <- RMSE(predicted_values2, observed_values2)
#mae
mae2 <- MAE(predicted_values2, observed_values2)
#mape
mape2 <- MAPE(predicted_values2, observed_values2)
#r^2
r_squared2 <- R2_Score(predicted_values2, observed_values2)
#display
cat("Root Mean Squared Error (RMSE):", round(rmse2, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 8.9667
cat("Mean Absolute Error (MAE):", round(mae2, digits = 4), "\n")
## Mean Absolute Error (MAE): 6.9653
cat("R-squared (R^2) Score:", round(r_squared2, digits = 4), "\n")
## R-squared (R^2) Score: 0.4235
cat("Mean Absolute Percentage Error (MPE):", round(mape2, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2525
# Root Mean Squared Error (RMSE): 8.9667
# Mean Absolute Error (MAE): 6.9653
# R-squared (R^2) Score: 0.4235
# Mean Absolute Percentage Error (MPE): 0.2525
#Model 3 - training
observed_values3 <- training_data$cnt
training_data$Predicted_cnt <- predict(model3, newdata = training_data)
predicted_values3 <- training_data$Predicted_cnt
#rmse
rmse3 <- RMSE(predicted_values3, observed_values3)
#mae
mae3 <- MAE(predicted_values3, observed_values3)
#mape
mape3 <- MAPE(predicted_values3, observed_values3)
#r^2
r_squared3 <- R2_Score(predicted_values2, observed_values2)
#display
cat("Root Mean Squared Error (RMSE):", round(rmse3, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 808.7555
cat("Mean Absolute Error (MAE):", round(mae3, digits = 4), "\n")
## Mean Absolute Error (MAE): 649.4414
cat("R-squared (R^2) Score:", round(r_squared3, digits = 4), "\n")
## R-squared (R^2) Score: 0.4235
cat("Mean Absolute Percentage Error (MPE):", round(mape3, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2749
# Root Mean Squared Error (RMSE): 808.7555
# Mean Absolute Error (MAE): 649.4414
# R-squared (R^2) Score: 0.4235
# Mean Absolute Percentage Error (MPE): 0.2749
#Model 3 - validation
observed_values3 <- validation_data$cnt
validation_data$Predicted_cnt <- predict(model3, newdata = validation_data)
predicted_values3 <- validation_data$Predicted_cnt
#rmse
rmse3 <- RMSE(predicted_values3, observed_values3)
#mae
mae3 <- MAE(predicted_values3, observed_values3)
#mape
mape3 <- MAPE(predicted_values3, observed_values3)
#r^2
r_squared3 <- R2_Score(predicted_values2, observed_values2)
#display
cat("Root Mean Squared Error (RMSE):", round(rmse3, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 2373.45
cat("Mean Absolute Error (MAE):", round(mae3, digits = 4), "\n")
## Mean Absolute Error (MAE): 2119.981
cat("R-squared (R^2) Score:", round(r_squared3, digits = 4), "\n")
## R-squared (R^2) Score: 0.4235
cat("Mean Absolute Percentage Error (MPE):", round(mape3, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.6334
# Root Mean Squared Error (RMSE): 2373.45
# Mean Absolute Error (MAE): 2119.981
# R-squared (R^2) Score: 0.4235
# Mean Absolute Percentage Error (MPE): 0.6334